[llvm] [AMDGPU] Infer amdgpu-no-flat-scratch-init attribute in AMDGPUAttributor (PR #94647)

Jun Wang via llvm-commits llvm-commits at lists.llvm.org
Thu Jun 6 10:54:23 PDT 2024


https://github.com/jwanggit86 created https://github.com/llvm/llvm-project/pull/94647

The AMDGPUAnnotateKernelFeatures pass infers the "amdgpu-calls" and "amdgpu-stack-objects" attributes, which are used to infer whether we need to initialize flat scratch. This is, however, not precise. Instead, we should use AMDGPUAttributor and infer amdgpu-no-flat-scratch-init on kernels. Refer to https://github.com/llvm/llvm-project/issues/63586 .

>From b28a77a2a77317a7e674c6f9c03531b52491e6c2 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Thu, 6 Jun 2024 10:45:12 -0700
Subject: [PATCH] [AMDGPU] Infer amdgpu-no-flat-scratch-init attribute in
 AMDGPUAttributor

The AMDGPUAnnotateKernelFeatures pass infers the "amdgpu-calls" and
"amdgpu-stack-objects" attributes, which are used to infer whether we need to
initialize flat scratch. This is, however, not precise. Instead, we should use
AMDGPUAttributor and infer amdgpu-no-flat-scratch-init on kernels.
---
 llvm/lib/Target/AMDGPU/AMDGPUAttributes.def   |    1 +
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |   43 +
 llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp    |   10 +-
 ...licit-kernarg-backend-usage-global-isel.ll |   18 +-
 .../AMDGPU/addrspacecast-constantexpr.ll      |    6 +-
 .../AMDGPU/amdgpu-attributor-no-agpr.ll       |   21 +-
 .../annotate-kernel-features-hsa-call.ll      |  112 +-
 .../AMDGPU/annotate-kernel-features-hsa.ll    |   48 +-
 .../AMDGPU/annotate-kernel-features.ll        |   18 +-
 .../attributor-flatscratchinit-globalisel.ll  | 1028 ++++++++++++
 .../AMDGPU/attributor-flatscratchinit.ll      |  914 +++++++++++
 llvm/test/CodeGen/AMDGPU/attributor-noopt.ll  |    2 +-
 .../AMDGPU/call-graph-register-usage.ll       |    8 +-
 .../callee-special-input-sgprs-fixed-abi.ll   |   36 +-
 .../CodeGen/AMDGPU/direct-indirect-call.ll    |    2 +-
 .../AMDGPU/duplicate-attribute-indirect.ll    |    2 +-
 .../test/CodeGen/AMDGPU/flat-address-space.ll |    8 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |   19 +-
 .../AMDGPU/implicitarg-offset-attributes.ll   |   30 +-
 llvm/test/CodeGen/AMDGPU/ipra.ll              |    2 +-
 llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll  |   96 +-
 .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll       |   17 +-
 .../AMDGPU/lower-module-lds-via-hybrid.ll     |   15 +-
 .../AMDGPU/lower-module-lds-via-table.ll      |   15 +-
 .../AMDGPU/memory-legalizer-flat-agent.ll     | 1380 +++++++++++++++++
 .../memory-legalizer-flat-nontemporal.ll      |   75 +
 .../memory-legalizer-flat-singlethread.ll     | 1380 +++++++++++++++++
 .../AMDGPU/memory-legalizer-flat-system.ll    | 1380 +++++++++++++++++
 .../AMDGPU/memory-legalizer-flat-volatile.ll  |   66 +
 .../AMDGPU/memory-legalizer-flat-wavefront.ll | 1365 ++++++++++++++++
 .../AMDGPU/memory-legalizer-flat-workgroup.ll | 1320 ++++++++++++++++
 .../AMDGPU/memory-legalizer-global-agent.ll   |  273 ++++
 .../memory-legalizer-global-nontemporal.ll    |   15 +
 .../memory-legalizer-global-singlethread.ll   |  276 ++++
 .../AMDGPU/memory-legalizer-global-system.ll  |  261 ++++
 .../memory-legalizer-global-volatile.ll       |   18 +
 .../memory-legalizer-global-wavefront.ll      |  276 ++++
 .../memory-legalizer-global-workgroup.ll      |  276 ++++
 .../memory-legalizer-local-nontemporal.ll     |    9 +
 .../AMDGPU/memory-legalizer-local-volatile.ll |    6 +
 .../memory-legalizer-private-nontemporal.ll   |   59 +-
 .../memory-legalizer-private-volatile.ll      |   30 +-
 .../AMDGPU/propagate-flat-work-group-size.ll  |   18 +-
 .../CodeGen/AMDGPU/propagate-waves-per-eu.ll  |   44 +-
 .../AMDGPU/recursive_global_initializer.ll    |    2 +-
 .../AMDGPU/remove-no-kernel-id-attribute.ll   |    8 +-
 .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll     |    2 +-
 .../CodeGen/AMDGPU/simple-indirect-call.ll    |    2 +-
 .../uniform-work-group-attribute-missing.ll   |    4 +-
 .../AMDGPU/uniform-work-group-multistep.ll    |    4 +-
 ...niform-work-group-nested-function-calls.ll |    4 +-
 ...ork-group-prevent-attribute-propagation.ll |    6 +-
 .../uniform-work-group-propagate-attribute.ll |    4 +-
 .../uniform-work-group-recursion-test.ll      |    6 +-
 .../CodeGen/AMDGPU/uniform-work-group-test.ll |    4 +-
 .../AMDGPU/vgpr-spill-placement-issue61083.ll |    2 +-
 56 files changed, 10689 insertions(+), 357 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
index bacc8e4e821e5..8c1c8219690ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
@@ -30,5 +30,6 @@ AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z")
 AMDGPU_ATTRIBUTE(LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id")
 AMDGPU_ATTRIBUTE(DEFAULT_QUEUE, "amdgpu-no-default-queue")
 AMDGPU_ATTRIBUTE(COMPLETION_ACTION, "amdgpu-no-completion-action")
+AMDGPU_ATTRIBUTE(FLAT_SCRATCH_INIT, "amdgpu-no-flat-scratch-init")
 
 #undef AMDGPU_ATTRIBUTE
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 43bfd0f13f875..8bdc9eab577a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -433,6 +433,19 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       indicatePessimisticFixpoint();
       return;
     }
+
+    bool HasAllocaOrASCast = false;
+    for (BasicBlock &BB : *F) {
+      for (Instruction &I : BB) {
+        if (isa<AllocaInst>(I) || isa<AddrSpaceCastInst>(I)) {
+          HasAllocaOrASCast = true;
+          removeAssumedBits(FLAT_SCRATCH_INIT);
+          break;
+        }
+      }
+      if (HasAllocaOrASCast)
+        break;
+    }
   }
 
   ChangeStatus updateImpl(Attributor &A) override {
@@ -519,6 +532,9 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
       removeAssumedBits(COMPLETION_ACTION);
 
+    if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
+      removeAssumedBits(FLAT_SCRATCH_INIT);
+
     return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                        : ChangeStatus::UNCHANGED;
   }
@@ -677,6 +693,33 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                               UsedAssumedInformation);
   }
+
+  // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
+  // not to be set.
+  bool needFlatScratchInit(Attributor &A) {
+    // This is called on each callee; false means callee shouldn't have
+    // no-flat-scratch-init.
+    auto CheckForNoFlatScratchInit = [&](Instruction &I) {
+      const auto &CB = cast<CallBase>(I);
+      const Value *CalleeOp = CB.getCalledOperand();
+      const Function *Callee = dyn_cast<Function>(CalleeOp);
+      if (!Callee) // indirect call
+        return CB.isInlineAsm();
+
+      if (Callee->isIntrinsic())
+        return true;
+
+      const auto *CalleeInfo = A.getAAFor<AAAMDAttributes>(
+          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
+      return CalleeInfo && CalleeInfo->isAssumed(FLAT_SCRATCH_INIT);
+    };
+
+    bool UsedAssumedInformation = false;
+    // If any callee is false (i.e. need FlatScratchInit),
+    // checkForAllCallLikeInstructions returns false
+    return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
+                                              UsedAssumedInformation);
+  }
 };
 
 AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 94ee4ac78142d..511e711bf724d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -1040,12 +1040,8 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
   const CallingConv::ID CC = F.getCallingConv();
   const bool IsKernel =
       CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
-  // FIXME: Should have analysis or something rather than attribute to detect
-  // calls.
-  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
-  // FIXME: This attribute is a hack, we just need an analysis on the function
-  // to look for allocas.
-  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
+  const bool NoFlatScratchInit =
+      F.hasFnAttribute("amdgpu-no-flat-scratch-init");
 
   if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
     KernargSegmentPtr = true;
@@ -1073,7 +1069,7 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
   // lowering.
   if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
       (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
-      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
+      (!NoFlatScratchInit || ST.enableFlatScratch()) &&
       !ST.flatScratchIsArchitected()) {
     FlatScratchInit = true;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 8859ac69923a9..74ba17d4bf59d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -12,7 +12,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x40
-; GFX8V4-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8V4-NEXT:    s_add_i32 s8, s8, s11
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s9
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_mov_b32 s4, s0
 ; GFX8V4-NEXT:    s_mov_b32 s5, s3
@@ -23,6 +25,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], s[6:7], 0
+; GFX8V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
@@ -37,7 +40,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xc8
-; GFX8V5-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8V5-NEXT:    s_add_i32 s6, s6, s9
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_mov_b32 s4, s0
 ; GFX8V5-NEXT:    s_mov_b32 s5, s2
@@ -47,6 +52,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], s[2:3], 0
+; GFX8V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
@@ -60,9 +66,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX9V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_mov_b32 s2, s0
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
@@ -71,6 +78,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V4-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
@@ -84,9 +92,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
 ; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX9V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_mov_b32 s2, s0
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
@@ -95,6 +104,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V5-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index cff9ce0506679..96bbcb7ed2149 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -233,9 +233,9 @@ attributes #1 = { nounwind }
 ; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
 ;.
 ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 33b1cc65dc569..5ace66fd2dd76 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -116,7 +116,7 @@ define amdgpu_kernel void @kernel_calls_extern() {
 define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
 ; CHECK-SAME: ) #[[ATTR4]] {
-; CHECK-NEXT:    call void @unknown() #[[ATTR9:[0-9]+]]
+; CHECK-NEXT:    call void @unknown() #[[ATTR10:[0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
   call void @unknown() #0
@@ -136,7 +136,7 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
 define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
 ; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] {
-; CHECK-NEXT:    call void [[INDIRECT]]() #[[ATTR9]]
+; CHECK-NEXT:    call void [[INDIRECT]]() #[[ATTR10]]
 ; CHECK-NEXT:    ret void
 ;
   call void %indirect() #0
@@ -229,7 +229,7 @@ define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) {
 
 define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 ; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
-; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR7:[0-9]+]] {
 ; CHECK-NEXT:    [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
 ; CHECK-NEXT:    call void [[FPTR]]()
 ; CHECK-NEXT:    ret void
@@ -242,14 +242,15 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 
 attributes #0 = { "amdgpu-no-agpr" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3:[0-9]+]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-no-agpr" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index 3d4ae84d9c698..23294eacbe6cb 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -530,7 +530,7 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null)
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -545,7 +545,7 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null)
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -593,7 +593,7 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ATTRIBUTOR_HSA-NEXT:    store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -611,7 +611,7 @@ define void @use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ATTRIBUTOR_HSA-NEXT:    store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -628,7 +628,7 @@ define void @func_indirect_use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_implicitarg_ptr()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -645,7 +645,7 @@ define internal void @defined.func() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -658,7 +658,7 @@ define void @func_call_external() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @external.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -673,7 +673,7 @@ define void @func_call_defined() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @defined.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -687,8 +687,8 @@ define void @func_call_asm() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
-; ATTRIBUTOR_HSA-NEXT:    call void asm sideeffect "", ""() #[[ATTR28:[0-9]+]]
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
+; ATTRIBUTOR_HSA-NEXT:    call void asm sideeffect "", ""() #[[ATTR31:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   call void asm sideeffect "", ""() #3
@@ -702,7 +702,7 @@ define amdgpu_kernel void @kern_call_external() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @external.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -717,7 +717,7 @@ define amdgpu_kernel void @func_kern_defined() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @defined.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -769,7 +769,7 @@ define float @func_indirect_call(ptr %fptr) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float [[FPTR]]()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -788,7 +788,7 @@ define float @func_extern_call() #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @extern()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -806,7 +806,7 @@ define float @func_null_call(ptr %fptr) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float null()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -827,7 +827,7 @@ define float @func_other_intrinsic_call(float %arg) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call
-; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR20]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]])
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -845,7 +845,7 @@ define amdgpu_kernel void @kern_sanitize_address() #4 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -861,7 +861,7 @@ define void @func_sanitize_address() #4 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -877,7 +877,7 @@ define void @func_indirect_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -893,7 +893,7 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -912,7 +912,7 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR21]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @extern_func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -928,7 +928,7 @@ define internal void @enqueue_block_def() #6 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR28:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -941,7 +941,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR26:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR29:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @enqueue_block_decl()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -956,7 +956,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR30:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @enqueue_block_def()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -969,7 +969,7 @@ define void @unused_enqueue_block() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -980,7 +980,7 @@ define internal void @known_func() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -994,8 +994,8 @@ define amdgpu_kernel void @kern_callsite_enqueue_block() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] {
-; ATTRIBUTOR_HSA-NEXT:    call void @known_func() #[[ATTR29:[0-9]+]]
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] {
+; ATTRIBUTOR_HSA-NEXT:    call void @known_func() #[[ATTR32:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   call void @known_func() #6
@@ -1024,36 +1024,40 @@ attributes #6 = { "enqueued-block" }
 ; AKF_HSA: attributes #[[ATTR7]] = { "enqueued-block" }
 ; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" }
 ;.
+
 ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR23:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR24:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind }
-; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR26:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR27:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR30]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR31]] = { nounwind }
+; ATTRIBUTOR_HSA: attributes #[[ATTR32]] = { "enqueued-block" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 43cdf85ed3818..cfd02d1fc733c 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -454,7 +454,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 
 define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_group_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(3) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -466,7 +466,7 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 {
 
 define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_private_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(5) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -479,7 +479,7 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 {
 ; No-op addrspacecast should not use queue ptr
 define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
-; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR13]] {
 ; HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr
 ; HSA-NEXT:    store volatile i32 0, ptr [[STOF]], align 4
 ; HSA-NEXT:    ret void
@@ -491,7 +491,7 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
 
 define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
-; HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR13]] {
 ; HSA-NEXT:    [[STOF:%.*]] = addrspacecast ptr addrspace(4) [[PTR]] to ptr
 ; HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr [[STOF]], align 4
 ; HSA-NEXT:    ret void
@@ -503,7 +503,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %
 
 define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_global_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(1) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -515,7 +515,7 @@ define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 {
 
 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_constant_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(4)
 ; HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -534,7 +534,7 @@ define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared
-; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]])
 ; ATTRIBUTOR_HSA-NEXT:    [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32
 ; ATTRIBUTOR_HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) undef, align 4
@@ -555,7 +555,7 @@ define amdgpu_kernel void @use_is_private(ptr %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private
-; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]])
 ; ATTRIBUTOR_HSA-NEXT:    [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32
 ; ATTRIBUTOR_HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) undef, align 4
@@ -575,7 +575,7 @@ define amdgpu_kernel void @use_alloca() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; ATTRIBUTOR_HSA-NEXT:    store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -596,7 +596,7 @@ define amdgpu_kernel void @use_alloca_non_entry_block() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] {
 ; ATTRIBUTOR_HSA-NEXT:  entry:
 ; ATTRIBUTOR_HSA-NEXT:    br label [[BB:%.*]]
 ; ATTRIBUTOR_HSA:       bb:
@@ -621,7 +621,7 @@ define void @use_alloca_func() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR13:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; ATTRIBUTOR_HSA-NEXT:    store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -643,19 +643,21 @@ attributes #1 = { nounwind }
 ; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" }
 ;.
 ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index 547ff69592ca0..89fe46d975309 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -398,13 +398,13 @@ attributes #1 = { nounwind }
 ; AKF_CHECK: attributes #[[ATTR1]] = { nounwind }
 ;.
 ; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
new file mode 100644
index 0000000000000..ce5a3eedb5ebb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -0,0 +1,1028 @@
+; Test the generation of the attribute amdgpu-no-flat-scratch-init
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator < %s | FileCheck -check-prefixes=GFX10 %s
+
+;; tests of alloca
+
+define void @without_alloca(i1 %arg0) {
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @with_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+; graphics functions won't get the attribute amdgpu-no-flat-scratch-init
+
+define amdgpu_vs void @with_alloca_cc_vs() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_gs void @with_alloca_cc_gs() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_ps void @with_alloca_cc_ps() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs void @with_alloca_cc_cs() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_hs void @with_alloca_cc_hs() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_ls void @with_alloca_cc_ls() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_es void @with_alloca_cc_es() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_gfx void @with_alloca_cc_gfx() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define void @call_without_alloca() {
+  call void @without_alloca(i1 true)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_cc_kernel() {
+  call void @without_alloca(i1 true)
+  ret void
+}
+
+define void @call_with_alloca() {
+  call void @with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_with_alloca_cc_kernel() {
+  call void @with_alloca()
+  ret void
+}
+
+define void @call_both_with_and_without_alloca() {
+  call void @with_alloca()
+  call void @without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
+  call void @with_alloca()
+  call void @without_alloca()
+  ret void
+}
+
+define void @call_call_without_alloca() {
+  call void @call_without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
+  call void @call_without_alloca()
+  ret void
+}
+
+define void @call_call_with_alloca() {
+  call void @call_with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
+  call void @call_with_alloca()
+  ret void
+}
+
+define void @with_alloca_call_without_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @without_alloca()
+  ret void
+}
+
+define void @with_alloca_call_with_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @with_alloca()
+  ret void
+}
+
+define void @with_alloca_call_call_without_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_without_alloca()
+  ret void
+}
+
+define void @with_alloca_call_call_with_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_with_alloca()
+  ret void
+}
+
+;; tests of addrspacecast
+
+define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
+  store volatile i32 0, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
+  store volatile i32 0, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
+  %stof = addrspacecast ptr addrspace(1) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
+  %stof = addrspacecast ptr addrspace(1) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
+  store volatile i32 0, ptr addrspace(2) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
+  store volatile i32 0, ptr addrspace(2) %ptr
+  ret void
+}
+
+define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
+  %stof = addrspacecast ptr addrspace(2) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
+  %stof = addrspacecast ptr addrspace(2) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
+  store volatile i32 0, ptr addrspace(3) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
+  store volatile i32 0, ptr addrspace(3) %ptr
+  ret void
+}
+
+define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
+  store volatile i32 0, ptr addrspace(4) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
+  store volatile i32 0, ptr addrspace(4) %ptr
+  ret void
+}
+
+define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
+  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
+  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  store volatile i32 0, ptr addrspace(5) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  store volatile i32 0, ptr addrspace(5) %ptr
+  ret void
+}
+
+define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+;; tests of mixed alloca and addrspacecast
+
+define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @without_alloca(i1 true)
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @without_alloca(i1 true)
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @without_alloca(i1 true)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @without_alloca(i1 true)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+;; tests of indirect call, intrinsics
+
+ at gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
+
+define void @with_indirect_call() {
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
+  call void %fptr()
+  ret void
+}
+
+define amdgpu_kernel void @with_indirect_call_cc_kernel() {
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
+  call void %fptr()
+  ret void
+}
+
+define void @call_with_indirect_call() {
+  call void @with_indirect_call()
+  ret void
+}
+
+define amdgpu_kernel void @call_with_indirect_call_cc_kernel() {
+  call void @with_indirect_call()
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+
+define void @use_intrinsic_workitem_id_x() {
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define void @call_use_intrinsic_workitem_id_x() {
+  call void @use_intrinsic_workitem_id_x()
+  ret void
+}
+
+define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
+  call void @use_intrinsic_workitem_id_x()
+  ret void
+}
+
+; GFX10: name:            without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_cc_vs
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_gs
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
+;
+; GFX10: name:            with_alloca_cc_ps
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_cs
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_hs
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
+;
+; GFX10: name:            with_alloca_cc_ls
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_es
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_gfx
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:  psInputAddr:     0
+;
+; GFX10: name:            with_alloca_cc_cs_chain
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' }
+; GFX10-NEXT:  psInputAddr:     0
+;
+; GFX10: name:            with_alloca_cc_cs_chain_preserve
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' }
+; GFX10-NEXT:  psInputAddr:     0
+;
+; GFX10: name:            call_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
+;
+; GFX10: name:            call_with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_both_with_and_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_both_with_and_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_call_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
+;
+; GFX10: name:            call_call_with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_call_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca_call_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_call_with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca_call_with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_call_call_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca_call_call_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_call_call_with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca_call_call_with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            without_global_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_global_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_global_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_global_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            without_region_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_region_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_region_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_region_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            without_group_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_group_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_group_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_group_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            without_constant_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_constant_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_constant_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_constant_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_with_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_with_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_both_with_and_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_call_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_call_with_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_with_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_call_both_with_and_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            with_cast_call_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_cast_call_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            with_cast_call_with_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_cast_call_with_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_without_alloca_and_without_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_without_alloca_and_without_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_without_alloca_and_with_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_without_alloca_and_with_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            with_indirect_call
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_indirect_call_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr8_sgpr9' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_with_indirect_call
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_with_indirect_call_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr8_sgpr9' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            use_intrinsic_workitem_id_x
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            use_intrinsic_workitem_id_x_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
+;
+; GFX10: name:            call_use_intrinsic_workitem_id_x
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_use_intrinsic_workitem_id_x_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
new file mode 100644
index 0000000000000..c0d700cc37464
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -0,0 +1,914 @@
+; Test the generation of the attribute amdgpu-no-flat-scratch-init
+; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s
+
+;; tests of alloca
+
+define void @without_alloca(i1 %arg0) {
+; GFX9-LABEL: define void @without_alloca(i1 %arg0)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_alloca(i1 %arg0)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @with_alloca() {
+; GFX9-LABEL: define void @with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
+; GFX9-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+; graphics functions won't get the attribute amdgpu-no-flat-scratch-init
+
+define amdgpu_vs void @with_alloca_cc_vs() {
+; GFX9-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_gs void @with_alloca_cc_gs() {
+; GFX9-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_ps void @with_alloca_cc_ps() {
+; GFX9-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs void @with_alloca_cc_cs() {
+; GFX9-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_hs void @with_alloca_cc_hs() {
+; GFX9-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_ls void @with_alloca_cc_ls() {
+; GFX9-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_es void @with_alloca_cc_es() {
+; GFX9-LABEL: define amdgpu_es void @with_alloca_cc_es()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_es void @with_alloca_cc_es()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_gfx void @with_alloca_cc_gfx() {
+; GFX9-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
+; GFX9-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
+; GFX9-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define void @call_without_alloca() {
+; GFX9-LABEL: define void @call_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @without_alloca(i1 true)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @without_alloca(i1 true)
+  ret void
+}
+
+define void @call_with_alloca() {
+; GFX9-LABEL: define void @call_with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @with_alloca()
+  ret void
+}
+
+define void @call_both_with_and_without_alloca() {
+; GFX9-LABEL: define void @call_both_with_and_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_both_with_and_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @with_alloca()
+  call void @without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @with_alloca()
+  call void @without_alloca()
+  ret void
+}
+
+define void @call_call_without_alloca() {
+; GFX9-LABEL: define void @call_call_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @call_without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+  call void @call_without_alloca()
+  ret void
+}
+
+define void @call_call_with_alloca() {
+; GFX9-LABEL: define void @call_call_with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @call_with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @call_with_alloca()
+  ret void
+}
+
+define void @with_alloca_call_without_alloca() {
+; GFX9-LABEL: define void @with_alloca_call_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca_call_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @without_alloca()
+  ret void
+}
+
+define void @with_alloca_call_with_alloca() {
+; GFX9-LABEL: define void @with_alloca_call_with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca_call_with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @with_alloca()
+  ret void
+}
+
+define void @with_alloca_call_call_without_alloca() {
+; GFX9-LABEL: define void @with_alloca_call_call_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca_call_call_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_without_alloca()
+  ret void
+}
+
+define void @with_alloca_call_call_with_alloca() {
+; GFX9-LABEL: define void @with_alloca_call_call_with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca_call_call_with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_with_alloca()
+  ret void
+}
+
+;; tests of addrspacecast
+
+define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(1) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(1) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
+; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(2) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(2) %ptr
+  ret void
+}
+
+define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
+; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(2) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(2) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(3) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(3) %ptr
+  ret void
+}
+
+define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
+; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(4) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(4) %ptr
+  ret void
+}
+
+define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
+; GFX9-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(5) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(5) %ptr
+  ret void
+}
+
+define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+;; tests of mixed alloca and addrspacecast
+
+define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @without_alloca(i1 true)
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @without_alloca(i1 true)
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @without_alloca(i1 true)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @without_alloca(i1 true)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+;; tests of indirect call, intrinsics
+
+ at gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
+
+define void @with_indirect_call() {
+; GFX9-LABEL: define void @with_indirect_call()
+; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_indirect_call()
+; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL:[0-9]+]] {
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
+  call void %fptr()
+  ret void
+}
+
+define amdgpu_kernel void @with_indirect_call_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL2:[0-9]+]]
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
+  call void %fptr()
+  ret void
+}
+
+define void @call_with_indirect_call() {
+; GFX9-LABEL: define void @call_with_indirect_call()
+; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_with_indirect_call()
+; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL:[0-9]+]]
+  call void @with_indirect_call()
+  ret void
+}
+
+define amdgpu_kernel void @call_with_indirect_call_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL2:[0-9]+]]
+  call void @with_indirect_call()
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+
+define void @use_intrinsic_workitem_id_x() {
+; GFX9-LABEL: define void @use_intrinsic_workitem_id_x()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+;
+; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define void @call_use_intrinsic_workitem_id_x() {
+; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+  call void @use_intrinsic_workitem_id_x()
+  ret void
+}
+
+define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+  call void @use_intrinsic_workitem_id_x()
+  ret void
+}
+
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+;GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_IND_CALL]]  = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index b2f01660201d7..7f5432823c695 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -23,7 +23,7 @@
 ; COV5: .amdhsa_user_sgpr_queue_ptr 0
 ; NOOPT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; NOOPT: .amdhsa_user_sgpr_dispatch_id 1
-; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0
+; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 1
 ; NOOPT: .amdhsa_user_sgpr_private_segment_size 0
 ; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
 ; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index ed418070ecb50..df8426c3c376a 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -33,8 +33,8 @@ define void @indirect_use_vcc() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
-; CI: ; NumSgprs: 38
-; VI-NOBUG: ; NumSgprs: 40
+; CI: ; NumSgprs: 36
+; VI-NOBUG: ; NumSgprs: 36
 ; VI-BUG: ; NumSgprs: 96
 ; GCN: ; NumVgprs: 41
 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
@@ -121,8 +121,8 @@ define void @indirect_use_80_sgpr() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
-; CI: ; NumSgprs: 84
-; VI-NOBUG: ; NumSgprs: 86
+; CI: ; NumSgprs: 82
+; VI-NOBUG: ; NumSgprs: 82
 ; VI-BUG: ; NumSgprs: 96
 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
   call void @indirect_use_80_sgpr()
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index b711542be5a7f..6dc0034d2717d 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -196,11 +196,11 @@ define hidden void @use_workgroup_id_yz() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x:
 ; GCN-NOT: s6
-; GCN: s_mov_b32 s12, s6
+; GCN: s_mov_b32 s12, s4
 ; GCN: s_mov_b32 s32, 0
-; GCN: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x at rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x at rel32@hi+12
+; GCN: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x at rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x at rel32@hi+12
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 
@@ -214,7 +214,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y:
 ; GCN-NOT: s12
-; GCN: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s13, s5
 ; GCN-NOT: s12
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -230,7 +230,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z:
 ; GCN-NOT: s12
 ; GCN-NOT: s13
-; GCN: s_mov_b32 s14, s7
+; GCN: s_mov_b32 s14, s5
 ; GCN-NOT: s12
 ; GCN-NOT: s13
 
@@ -247,8 +247,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy:
 ; GCN-NOT: s14
-; GCN: s_mov_b32 s12, s6
-; GCN-NEXT: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s12, s4
+; GCN-NEXT: s_mov_b32 s13, s5
 ; GCN-NOT: s14
 
 ; GCN: s_mov_b32 s32, 0
@@ -263,9 +263,9 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
 }
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz:
-; GCN: s_mov_b32 s12, s6
-; GCN: s_mov_b32 s13, s7
-; GCN: s_mov_b32 s14, s8
+; GCN: s_mov_b32 s12, s4
+; GCN: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s14, s6
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 
@@ -280,8 +280,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 {
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz:
 
 ; GCN-NOT: s13
-; GCN: s_mov_b32 s12, s6
-; GCN-NEXT: s_mov_b32 s14, s7
+; GCN: s_mov_b32 s12, s4
+; GCN-NEXT: s_mov_b32 s14, s5
 ; GCN-NOT: s13
 
 ; GCN: s_mov_b32 s32, 0
@@ -297,8 +297,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz:
 
-; GCN: s_mov_b32 s13, s7
-; GCN: s_mov_b32 s14, s8
+; GCN: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s14, s6
 
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -379,7 +379,7 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
 
 ; GCN-NOT: s13
 ; GCN-NOT: s14
-; GCN-DAG: s_mov_b32 s12, s6
+; GCN-DAG: s_mov_b32 s12, s4
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
 ; GCN-NOT: s13
 ; GCN-NOT: s14
@@ -397,7 +397,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y:
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s13, s7
+; GCN-DAG: s_mov_b32 s13, s5
 
 ; GCN-DAG: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -412,7 +412,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z:
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s14, s7
+; GCN-DAG: s_mov_b32 s14, s5
 
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index 386f9cd3f9ce7..5a763dd87c8c1 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -35,6 +35,6 @@ define amdgpu_kernel void @test_direct_indirect_call() {
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index 05558c555c581..4e3346a5bf24c 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -42,6 +42,6 @@ attributes #0 = { "amdgpu-no-dispatch-id" }
 ;.
 ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
 ;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 0ad53083d0ff3..eb3d356ea59b4 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -20,7 +20,7 @@ define amdgpu_kernel void @store_flat_i32(ptr addrspace(1) %gptr, i32 %x) #0 {
 }
 
 ; GCN-LABEL: {{^}}store_flat_i64:
-; GCN: flat_store_{{dwordx2|b64}}
+; GCN: flat_store_{{dword|b64}}
 define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 {
   %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   store volatile i64 %x, ptr %fptr, align 8
@@ -28,7 +28,7 @@ define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 {
 }
 
 ; GCN-LABEL: {{^}}store_flat_v4i32:
-; GCN: flat_store_{{dwordx4|b128}}
+; GCN: flat_store_{{dword|b128}}
 define amdgpu_kernel void @store_flat_v4i32(ptr addrspace(1) %gptr, <4 x i32> %x) #0 {
   %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   store volatile <4 x i32> %x, ptr %fptr, align 16
@@ -65,7 +65,7 @@ define amdgpu_kernel void @load_flat_i32(ptr addrspace(1) noalias %out, ptr addr
 }
 
 ; GCN-LABEL: load_flat_i64:
-; GCN: flat_load_{{dwordx2|b64}}
+; GCN: flat_load_{{dword|b64}}
 define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
   %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   %fload = load volatile i64, ptr %fptr, align 8
@@ -74,7 +74,7 @@ define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addr
 }
 
 ; GCN-LABEL: load_flat_v4i32:
-; GCN: flat_load_{{dwordx4|b128}}
+; GCN: flat_load_{{dword|b128}}
 define amdgpu_kernel void @load_flat_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
   %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   %fload = load volatile <4 x i32>, ptr %fptr, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 72f10ea892e53..6d2182c4eb321 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -12,7 +12,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x40
-; GFX8V4-NEXT:    v_mov_b32_e32 v4, 1
+; GFX8V4-NEXT:    s_add_i32 s8, s8, s11
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s9
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V4-NEXT:    s_cselect_b32 s3, s3, 0
@@ -22,6 +24,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8V4-NEXT:    s_cselect_b32 s0, s2, 0
 ; GFX8V4-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX8V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v4
@@ -35,7 +38,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xc8
-; GFX8V5-NEXT:    v_mov_b32_e32 v4, 1
+; GFX8V5-NEXT:    s_add_i32 s6, s6, s9
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V5-NEXT:    s_cselect_b32 s2, s2, 0
@@ -45,6 +50,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8V5-NEXT:    s_cselect_b32 s0, s3, 0
 ; GFX8V5-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX8V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v4
@@ -57,9 +63,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX9V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX9V4-NEXT:    s_cselect_b32 s2, s3, 0
@@ -69,6 +76,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9V4-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX9V4-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX9V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v4
@@ -81,9 +89,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
 ; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX9V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX9V5-NEXT:    s_cselect_b32 s2, s3, 0
@@ -93,6 +102,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9V5-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX9V5-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX9V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v4
@@ -101,6 +111,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    flat_store_dword v[2:3], v0
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    s_endpgm
+
   %flat.private = addrspacecast ptr addrspace(5) %ptr.private to ptr
   %flat.local = addrspacecast ptr addrspace(3) %ptr.local to ptr
   store volatile i32 1, ptr %flat.private
diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
index 4c21f87297455..5979f1bbacdd1 100644
--- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
@@ -258,25 +258,25 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo
 
 ;.
 ; V4: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
 ;.
 ; V5: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
 ;.
 ; V6: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
 ;.
 ; V4: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index 6c8646968b676..875f7bf75a3cf 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -30,7 +30,7 @@ define hidden void @func() #1 {
 ; GCN-NOT: writelane
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
 
-; GCN: ; NumSgprs: 37
+; GCN: ; NumSgprs: 33
 ; GCN: ; NumVgprs: 9
 define amdgpu_kernel void @kernel_call() #0 {
   %vgpr = load volatile i32, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 9619cb73b1538..1dcc34c7ad87c 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -113,18 +113,14 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s6, s6, s9
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -172,18 +168,14 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s6, s6, s9
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -231,18 +223,14 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s6, s6, s9
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -290,18 +278,14 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx)
 define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s6, s6, s9
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
 ; CHECK-NEXT:    s_add_u32 s6, s6, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s7, s7, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
@@ -336,12 +320,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s6, s6, s9
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
@@ -352,6 +331,7 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
 ; CHECK-NEXT:    s_mov_b32 s15, 0
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -365,18 +345,14 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s6, s6, s9
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -405,12 +381,7 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s6, s6, s9
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
@@ -421,6 +392,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
 ; CHECK-NEXT:    s_mov_b32 s15, 2
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -434,18 +406,14 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s6, s6, s9
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -474,12 +442,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s6, s6, s9
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
@@ -490,6 +453,7 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
 ; CHECK-NEXT:    s_mov_b32 s15, 1
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -503,18 +467,14 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s6, s6, s9
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -543,12 +503,7 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s6, s6, s9
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
@@ -559,6 +514,7 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, 2
 ; CHECK-NEXT:    s_mov_b32 s15, 3
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
@@ -572,18 +528,14 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s6, s6, s9
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s7, s7, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s6
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s7
-; CHECK-NEXT:    s_add_u32 s0, s0, s9
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_add_u32 s8, s4, 8
 ; CHECK-NEXT:    s_addc_u32 s9, s5, 0
 ; CHECK-NEXT:    s_getpc_b64 s[4:5]
 ; CHECK-NEXT:    s_add_u32 s4, s4, use_module at gotpcrel32@lo+4
 ; CHECK-NEXT:    s_addc_u32 s5, s5, use_module at gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
 ; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 61818dafd2b84..b88c3b16fab82 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -42,22 +42,19 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 ; GCN-LABEL: indirect_lds_id:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; GCN-NEXT:    s_add_i32 s6, s6, s9
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_getpc_b64 s[6:7]
-; GCN-NEXT:    s_add_u32 s6, s6, function_lds_id at gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s7, s7, function_lds_id at gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GCN-NEXT:    s_getpc_b64 s[8:9]
+; GCN-NEXT:    s_add_u32 s8, s8, function_lds_id at gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s9, s9, function_lds_id at gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
 ; GCN-NEXT:    s_mov_b32 s15, 21
-; GCN-NEXT:    s_mov_b32 s12, s8
+; GCN-NEXT:    s_mov_b32 s12, s6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    s_endpgm
   call void @function_lds_id(ptr addrspace(1) %out)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index 00d01a080ad14..558f046b0b2d9 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -161,10 +161,7 @@ define amdgpu_kernel void @k01() {
 ; GCN-LABEL: k01:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; GCN-NEXT:    s_add_i32 s6, s6, s9
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
@@ -195,10 +192,7 @@ define amdgpu_kernel void @k23() {
 ; GCN-LABEL: k23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; GCN-NEXT:    s_add_i32 s6, s6, s9
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
@@ -237,10 +231,7 @@ define amdgpu_kernel void @k123() {
 ; GCN-LABEL: k123:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; GCN-NEXT:    s_add_i32 s6, s6, s9
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index d3cc60c501fd7..b97c79be6dbed 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -226,10 +226,7 @@ define amdgpu_kernel void @k01() {
 ; GCN-LABEL: k01:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; GCN-NEXT:    s_add_i32 s6, s6, s9
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
@@ -265,10 +262,7 @@ define amdgpu_kernel void @k23() {
 ; GCN-LABEL: k23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; GCN-NEXT:    s_add_i32 s6, s6, s9
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
@@ -307,10 +301,7 @@ define amdgpu_kernel void @k123() {
 ; GCN-LABEL: k123:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; GCN-NEXT:    s_add_i32 s6, s6, s9
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s9
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GCN-NEXT:    s_getpc_b64 s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 193f642cb1fa1..51a66e8e94220 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_agent_unordered_load(
 ; GFX7-LABEL: flat_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -200,6 +215,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_load(
 ; GFX7-LABEL: flat_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -215,6 +233,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -231,6 +253,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -262,6 +288,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -276,6 +304,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -385,6 +415,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_load(
 ; GFX7-LABEL: flat_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -401,6 +434,10 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -419,6 +456,10 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -452,6 +493,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -467,6 +510,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -589,6 +634,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_load(
 ; GFX7-LABEL: flat_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -606,6 +654,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -626,6 +678,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -662,6 +718,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -678,6 +736,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -815,6 +875,9 @@ entry:
 define amdgpu_kernel void @flat_agent_unordered_store(
 ; GFX7-LABEL: flat_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -826,6 +889,10 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -838,6 +905,10 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -861,6 +932,8 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -872,6 +945,8 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -955,6 +1030,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_store(
 ; GFX7-LABEL: flat_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -966,6 +1044,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -978,6 +1060,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1001,6 +1087,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1012,6 +1100,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1095,6 +1185,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_store(
 ; GFX7-LABEL: flat_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1107,6 +1200,10 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1121,6 +1218,10 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1147,6 +1248,8 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1159,6 +1262,8 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1259,6 +1364,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_store(
 ; GFX7-LABEL: flat_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1271,6 +1379,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1285,6 +1397,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1311,6 +1427,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1323,6 +1441,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1423,6 +1543,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1435,6 +1558,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1448,6 +1575,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1473,6 +1604,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1485,6 +1618,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1575,6 +1710,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ; GFX7-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1589,6 +1727,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1606,6 +1748,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1636,6 +1782,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1650,6 +1798,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1758,6 +1908,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ; GFX7-LABEL: flat_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1771,6 +1924,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1786,6 +1943,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1814,6 +1975,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1827,6 +1990,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1934,6 +2099,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1949,6 +2117,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1968,6 +2140,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2001,6 +2177,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2016,6 +2194,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2141,6 +2321,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2156,6 +2339,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2175,6 +2362,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2208,6 +2399,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2223,6 +2416,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2348,6 +2543,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2364,6 +2562,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2382,6 +2584,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2415,6 +2621,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2430,6 +2638,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2552,6 +2762,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2569,6 +2782,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2589,6 +2806,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2625,6 +2846,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2641,6 +2864,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2780,6 +3005,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2797,6 +3025,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2817,6 +3049,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2853,6 +3089,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2869,6 +3107,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3008,6 +3248,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3033,6 +3276,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3058,6 +3305,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3108,6 +3359,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3123,6 +3376,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3239,6 +3494,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3266,6 +3524,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3295,6 +3557,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3350,6 +3616,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3367,6 +3635,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3501,6 +3771,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3527,6 +3800,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3554,6 +3831,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3607,6 +3888,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3623,6 +3906,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3756,6 +4041,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3784,6 +4072,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3815,6 +4107,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3873,6 +4169,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3891,6 +4189,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4042,6 +4342,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4070,6 +4373,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4101,6 +4408,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4159,6 +4470,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4177,6 +4490,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4328,6 +4643,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4355,6 +4673,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4384,6 +4706,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4439,6 +4765,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4456,6 +4784,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4590,6 +4920,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4617,6 +4950,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4646,6 +4983,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4701,6 +5042,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4718,6 +5061,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4852,6 +5197,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4880,6 +5228,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4911,6 +5263,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4969,6 +5325,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4987,6 +5345,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5138,6 +5498,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5166,6 +5529,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5197,6 +5564,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5255,6 +5626,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5273,6 +5646,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5424,6 +5799,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5452,6 +5830,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5483,6 +5865,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5541,6 +5927,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5559,6 +5947,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5710,6 +6100,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5738,6 +6131,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5769,6 +6166,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5827,6 +6228,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5845,6 +6248,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5996,6 +6401,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6024,6 +6432,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6055,6 +6467,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6113,6 +6529,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6131,6 +6549,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6282,6 +6702,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6310,6 +6733,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6341,6 +6768,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6399,6 +6830,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6417,6 +6850,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6568,6 +7003,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6596,6 +7034,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6627,6 +7069,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6685,6 +7131,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6703,6 +7151,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6854,6 +7304,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6882,6 +7335,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6913,6 +7370,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6971,6 +7432,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6989,6 +7452,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7140,6 +7605,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7168,6 +7636,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7196,6 +7668,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7252,6 +7728,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7270,6 +7748,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7413,6 +7893,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7442,6 +7925,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7472,6 +7959,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7530,6 +8021,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7549,6 +8042,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7705,6 +8200,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7734,6 +8232,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7764,6 +8266,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7823,6 +8329,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7842,6 +8350,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8002,6 +8512,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8032,6 +8545,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8064,6 +8581,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8125,6 +8646,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8145,6 +8668,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8318,6 +8843,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8348,6 +8876,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8380,6 +8912,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8441,6 +8977,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8461,6 +8999,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8634,6 +9174,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8663,6 +9206,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8693,6 +9240,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8751,6 +9302,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8770,6 +9323,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8926,6 +9481,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8955,6 +9513,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8985,6 +9547,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9043,6 +9609,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9062,6 +9630,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9218,6 +9788,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9248,6 +9821,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9280,6 +9857,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9341,6 +9922,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9361,6 +9944,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9534,6 +10119,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9564,6 +10152,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9596,6 +10188,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9657,6 +10253,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9677,6 +10275,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9850,6 +10450,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9880,6 +10483,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9912,6 +10519,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9973,6 +10584,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9993,6 +10606,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10166,6 +10781,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10196,6 +10814,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10228,6 +10850,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10289,6 +10915,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10309,6 +10937,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10482,6 +11112,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10512,6 +11145,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10544,6 +11181,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10605,6 +11246,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10625,6 +11268,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10798,6 +11443,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10828,6 +11476,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10860,6 +11512,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10921,6 +11577,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10941,6 +11599,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11114,6 +11774,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11144,6 +11807,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11176,6 +11843,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11237,6 +11908,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11257,6 +11930,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11430,6 +12105,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11460,6 +12138,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11492,6 +12174,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11553,6 +12239,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11573,6 +12261,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11746,6 +12436,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ; GFX7-LABEL: flat_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11761,6 +12454,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11777,6 +12474,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11808,6 +12509,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11822,6 +12525,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11931,6 +12636,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11946,6 +12654,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11962,6 +12674,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11993,6 +12709,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12007,6 +12725,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12116,6 +12836,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ; GFX7-LABEL: flat_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12133,6 +12856,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12152,6 +12879,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12187,6 +12918,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12203,6 +12936,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12330,6 +13065,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12348,6 +13086,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12369,6 +13111,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12407,6 +13153,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12424,6 +13172,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12566,6 +13316,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ; GFX7-LABEL: flat_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12577,6 +13330,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12589,6 +13346,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12612,6 +13373,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12623,6 +13386,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12706,6 +13471,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12717,6 +13485,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12729,6 +13501,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12752,6 +13528,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12763,6 +13541,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12846,6 +13626,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_store(
 ; GFX7-LABEL: flat_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12858,6 +13641,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12872,6 +13659,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12898,6 +13689,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12910,6 +13703,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13010,6 +13805,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13022,6 +13820,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13036,6 +13838,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13062,6 +13868,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13074,6 +13882,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13174,6 +13984,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13186,6 +13999,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13199,6 +14016,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13224,6 +14045,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13236,6 +14059,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13326,6 +14151,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13340,6 +14168,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13356,6 +14188,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13385,6 +14221,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13399,6 +14237,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13505,6 +14345,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13518,6 +14361,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13533,6 +14380,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13561,6 +14412,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13574,6 +14427,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13681,6 +14536,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13696,6 +14554,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13714,6 +14576,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13746,6 +14612,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13761,6 +14629,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13884,6 +14754,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13899,6 +14772,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13917,6 +14794,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13949,6 +14830,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13964,6 +14847,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14087,6 +14972,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14104,6 +14992,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14123,6 +15015,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14158,6 +15054,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14174,6 +15072,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14301,6 +15201,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14319,6 +15222,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14340,6 +15247,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14378,6 +15289,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14395,6 +15308,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14539,6 +15454,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14557,6 +15475,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14578,6 +15500,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14616,6 +15542,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14633,6 +15561,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14777,6 +15707,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14802,6 +15735,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14827,6 +15764,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14877,6 +15818,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14892,6 +15835,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15008,6 +15953,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15035,6 +15983,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15063,6 +16015,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15117,6 +16073,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15134,6 +16092,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15266,6 +16226,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15292,6 +16255,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15319,6 +16286,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15372,6 +16343,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15388,6 +16361,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15521,6 +16496,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15549,6 +16527,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15579,6 +16561,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15636,6 +16622,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15654,6 +16642,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15803,6 +16793,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15831,6 +16824,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15861,6 +16858,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15918,6 +16919,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15936,6 +16939,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16085,6 +17090,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16112,6 +17120,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16140,6 +17152,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16194,6 +17210,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16211,6 +17229,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16343,6 +17363,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16370,6 +17393,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16398,6 +17425,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16452,6 +17483,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16469,6 +17502,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16601,6 +17636,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16629,6 +17667,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16659,6 +17701,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16716,6 +17762,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16734,6 +17782,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16883,6 +17933,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16911,6 +17964,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16941,6 +17998,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16998,6 +18059,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17016,6 +18079,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17165,6 +18230,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17193,6 +18261,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17223,6 +18295,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17280,6 +18356,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17298,6 +18376,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17447,6 +18527,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17475,6 +18558,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17505,6 +18592,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17562,6 +18653,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17580,6 +18673,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17729,6 +18824,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17757,6 +18855,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17787,6 +18889,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17844,6 +18950,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17862,6 +18970,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18011,6 +19121,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18039,6 +19152,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18069,6 +19186,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18126,6 +19247,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18144,6 +19267,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18293,6 +19418,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18321,6 +19449,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18351,6 +19483,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18408,6 +19544,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18426,6 +19564,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18575,6 +19715,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18603,6 +19746,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18633,6 +19780,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18690,6 +19841,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18708,6 +19861,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18857,6 +20012,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18885,6 +20043,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18913,6 +20075,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18969,6 +20135,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18987,6 +20155,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19130,6 +20300,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19160,6 +20333,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19191,6 +20368,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19251,6 +20432,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19271,6 +20454,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19432,6 +20617,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19461,6 +20649,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19491,6 +20683,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19550,6 +20746,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19569,6 +20767,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19729,6 +20929,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19760,6 +20963,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19793,6 +21000,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19856,6 +21067,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19877,6 +21090,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20055,6 +21270,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20086,6 +21304,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20119,6 +21341,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20182,6 +21408,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20203,6 +21431,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20381,6 +21611,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20411,6 +21644,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20442,6 +21679,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20502,6 +21743,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20522,6 +21765,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20683,6 +21928,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20713,6 +21961,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20744,6 +21996,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20804,6 +22060,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20824,6 +22082,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20985,6 +22245,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21016,6 +22279,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21049,6 +22316,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21112,6 +22383,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21133,6 +22406,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21311,6 +22586,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21342,6 +22620,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21375,6 +22657,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21438,6 +22724,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21459,6 +22747,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21637,6 +22927,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21668,6 +22961,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21701,6 +22998,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21764,6 +23065,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21785,6 +23088,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21963,6 +23268,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21994,6 +23302,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22027,6 +23339,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22090,6 +23406,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22111,6 +23429,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22289,6 +23609,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22320,6 +23643,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22353,6 +23680,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22416,6 +23747,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22437,6 +23770,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22615,6 +23950,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22646,6 +23984,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22679,6 +24021,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22742,6 +24088,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22763,6 +24111,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22941,6 +24291,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22972,6 +24325,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23005,6 +24362,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23068,6 +24429,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23089,6 +24452,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23267,6 +24632,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23298,6 +24666,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23331,6 +24703,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23394,6 +24770,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23415,6 +24793,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 5fa8e6891bafb..5a89e1269526b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -200,6 +215,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX7-LABEL: flat_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -228,6 +246,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
@@ -255,6 +277,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_mov_b32 s6, 2
@@ -310,6 +336,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
@@ -339,6 +367,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
@@ -539,6 +569,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_0(
 ; GFX7-LABEL: flat_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -554,6 +587,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -570,6 +607,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -601,6 +642,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -615,6 +658,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -724,6 +769,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX7-LABEL: flat_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -752,6 +800,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -780,6 +832,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -836,6 +892,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -866,6 +924,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1067,6 +1127,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ; GFX7-LABEL: flat_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -1083,6 +1146,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1100,6 +1167,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1133,6 +1204,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1148,6 +1221,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 4c9ce15211e34..f39801b6b7ade 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_singlethread_unordered_load(
 ; GFX7-LABEL: flat_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -200,6 +215,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ; GFX7-LABEL: flat_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -215,6 +233,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -231,6 +253,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -262,6 +288,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -276,6 +304,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -385,6 +415,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_load(
 ; GFX7-LABEL: flat_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -400,6 +433,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -416,6 +453,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -447,6 +488,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -461,6 +504,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -570,6 +615,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ; GFX7-LABEL: flat_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -585,6 +633,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -601,6 +653,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -632,6 +688,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -646,6 +704,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -755,6 +815,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_unordered_store(
 ; GFX7-LABEL: flat_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -766,6 +829,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -778,6 +845,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -801,6 +872,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -812,6 +885,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -895,6 +970,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ; GFX7-LABEL: flat_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -906,6 +984,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -918,6 +1000,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -941,6 +1027,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -952,6 +1040,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1035,6 +1125,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_store(
 ; GFX7-LABEL: flat_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1046,6 +1139,10 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1058,6 +1155,10 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1081,6 +1182,8 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1092,6 +1195,8 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1175,6 +1280,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ; GFX7-LABEL: flat_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1186,6 +1294,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1198,6 +1310,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1221,6 +1337,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1232,6 +1350,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1315,6 +1435,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1327,6 +1450,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1340,6 +1467,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1365,6 +1496,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1377,6 +1510,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1467,6 +1602,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1479,6 +1617,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1492,6 +1634,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1517,6 +1663,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1529,6 +1677,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1619,6 +1769,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1631,6 +1784,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1644,6 +1801,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1669,6 +1830,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1681,6 +1844,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1771,6 +1936,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1783,6 +1951,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1796,6 +1968,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1821,6 +1997,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1833,6 +2011,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1923,6 +2103,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1935,6 +2118,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1948,6 +2135,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1973,6 +2164,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1985,6 +2178,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2075,6 +2270,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2090,6 +2288,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2106,6 +2308,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2137,6 +2343,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2151,6 +2359,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2260,6 +2470,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2275,6 +2488,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2291,6 +2508,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2322,6 +2543,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2336,6 +2559,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2445,6 +2670,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2460,6 +2688,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2476,6 +2708,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2507,6 +2743,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2521,6 +2759,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2630,6 +2870,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2655,6 +2898,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2680,6 +2927,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2730,6 +2981,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2745,6 +2998,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2861,6 +3116,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2886,6 +3144,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2911,6 +3173,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2961,6 +3227,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2976,6 +3244,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3092,6 +3362,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3117,6 +3390,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3142,6 +3419,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3192,6 +3473,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3207,6 +3490,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3323,6 +3608,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3348,6 +3636,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3373,6 +3665,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3423,6 +3719,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3438,6 +3736,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3554,6 +3854,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3579,6 +3882,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3604,6 +3911,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3654,6 +3965,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3669,6 +3982,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3785,6 +4100,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3810,6 +4128,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3835,6 +4157,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3885,6 +4211,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3900,6 +4228,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4016,6 +4346,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4041,6 +4374,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4066,6 +4403,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4116,6 +4457,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4131,6 +4474,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4247,6 +4592,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4272,6 +4620,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4297,6 +4649,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4347,6 +4703,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4362,6 +4720,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4478,6 +4838,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4503,6 +4866,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4528,6 +4895,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4578,6 +4949,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4593,6 +4966,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4709,6 +5084,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4734,6 +5112,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4759,6 +5141,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4809,6 +5195,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4824,6 +5212,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4940,6 +5330,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4965,6 +5358,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4990,6 +5387,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5040,6 +5441,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5055,6 +5458,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5171,6 +5576,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5196,6 +5604,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5221,6 +5633,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5271,6 +5687,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5286,6 +5704,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5402,6 +5822,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5427,6 +5850,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5452,6 +5879,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5502,6 +5933,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5517,6 +5950,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5633,6 +6068,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5658,6 +6096,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5683,6 +6125,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5733,6 +6179,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5748,6 +6196,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5864,6 +6314,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5889,6 +6342,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5914,6 +6371,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5964,6 +6425,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5979,6 +6442,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6095,6 +6560,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6123,6 +6591,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6151,6 +6623,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6207,6 +6683,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6225,6 +6703,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6368,6 +6848,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6396,6 +6879,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6424,6 +6911,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6480,6 +6971,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6498,6 +6991,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6641,6 +7136,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6669,6 +7167,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6697,6 +7199,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6753,6 +7259,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6771,6 +7279,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6914,6 +7424,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6942,6 +7455,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6970,6 +7487,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7026,6 +7547,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7044,6 +7567,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7187,6 +7712,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7215,6 +7743,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7243,6 +7775,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7299,6 +7835,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7317,6 +7855,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7460,6 +8000,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7488,6 +8031,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7516,6 +8063,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7572,6 +8123,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7590,6 +8143,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7733,6 +8288,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7761,6 +8319,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7789,6 +8351,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7845,6 +8411,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7863,6 +8431,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8006,6 +8576,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8034,6 +8607,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8062,6 +8639,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8118,6 +8699,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8136,6 +8719,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8279,6 +8864,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8307,6 +8895,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8335,6 +8927,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8391,6 +8987,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8409,6 +9007,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8552,6 +9152,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8580,6 +9183,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8608,6 +9215,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8664,6 +9275,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8682,6 +9295,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8825,6 +9440,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8853,6 +9471,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8881,6 +9503,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8937,6 +9563,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8955,6 +9583,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9098,6 +9728,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9126,6 +9759,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9154,6 +9791,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9210,6 +9851,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9228,6 +9871,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9371,6 +10016,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9399,6 +10047,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9427,6 +10079,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9483,6 +10139,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9501,6 +10159,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9644,6 +10304,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9672,6 +10335,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9700,6 +10367,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9756,6 +10427,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9774,6 +10447,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9917,6 +10592,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9945,6 +10623,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9973,6 +10655,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10029,6 +10715,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10047,6 +10735,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10190,6 +10880,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10205,6 +10898,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10221,6 +10918,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10252,6 +10953,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10266,6 +10969,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10375,6 +11080,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10390,6 +11098,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10406,6 +11118,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10437,6 +11153,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10451,6 +11169,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10560,6 +11280,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10575,6 +11298,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10591,6 +11318,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10622,6 +11353,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10636,6 +11369,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10745,6 +11480,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10760,6 +11498,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10776,6 +11518,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10807,6 +11553,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10821,6 +11569,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10930,6 +11680,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10941,6 +11694,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10953,6 +11710,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10976,6 +11737,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10987,6 +11750,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11070,6 +11835,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11081,6 +11849,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11093,6 +11865,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11116,6 +11892,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11127,6 +11905,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11210,6 +11990,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ; GFX7-LABEL: flat_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11221,6 +12004,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11233,6 +12020,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11256,6 +12047,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11267,6 +12060,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11350,6 +12145,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11361,6 +12159,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11373,6 +12175,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11396,6 +12202,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11407,6 +12215,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11490,6 +12300,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11502,6 +12315,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11515,6 +12332,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11540,6 +12361,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11552,6 +12375,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11642,6 +12467,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11654,6 +12482,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11667,6 +12499,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11692,6 +12528,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11704,6 +12542,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11794,6 +12634,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11806,6 +12649,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11819,6 +12666,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11844,6 +12695,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11856,6 +12709,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11946,6 +12801,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11958,6 +12816,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11971,6 +12833,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11996,6 +12862,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12008,6 +12876,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12098,6 +12968,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12110,6 +12983,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12123,6 +13000,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12148,6 +13029,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12160,6 +13043,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12250,6 +13135,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12265,6 +13153,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12281,6 +13173,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12312,6 +13208,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12326,6 +13224,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12435,6 +13335,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12450,6 +13353,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12466,6 +13373,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12497,6 +13408,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12511,6 +13424,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12620,6 +13535,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12635,6 +13553,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12651,6 +13573,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12682,6 +13608,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12696,6 +13624,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12805,6 +13735,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -12830,6 +13763,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -12855,6 +13792,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -12905,6 +13846,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12920,6 +13863,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13036,6 +13981,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13061,6 +14009,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13086,6 +14038,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13136,6 +14092,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13151,6 +14109,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13267,6 +14227,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13292,6 +14255,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13317,6 +14284,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13367,6 +14338,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13382,6 +14355,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13498,6 +14473,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13523,6 +14501,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13548,6 +14530,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13598,6 +14584,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13613,6 +14601,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13729,6 +14719,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13754,6 +14747,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13779,6 +14776,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13829,6 +14830,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13844,6 +14847,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13960,6 +14965,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13985,6 +14993,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14010,6 +15022,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14060,6 +15076,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14075,6 +15093,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14191,6 +15211,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14216,6 +15239,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14241,6 +15268,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14291,6 +15322,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14306,6 +15339,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14422,6 +15457,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14447,6 +15485,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14472,6 +15514,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14522,6 +15568,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14537,6 +15585,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14653,6 +15703,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14678,6 +15731,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14703,6 +15760,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14753,6 +15814,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14768,6 +15831,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14884,6 +15949,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14909,6 +15977,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14934,6 +16006,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14984,6 +16060,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14999,6 +16077,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15115,6 +16195,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15140,6 +16223,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15165,6 +16252,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15215,6 +16306,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15230,6 +16323,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15346,6 +16441,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15371,6 +16469,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15396,6 +16498,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15446,6 +16552,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15461,6 +16569,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15577,6 +16687,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15602,6 +16715,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15627,6 +16744,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15677,6 +16798,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15692,6 +16815,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15808,6 +16933,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15833,6 +16961,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15858,6 +16990,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15908,6 +17044,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15923,6 +17061,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16039,6 +17179,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16064,6 +17207,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16089,6 +17236,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16139,6 +17290,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16154,6 +17307,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16270,6 +17425,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16298,6 +17456,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16326,6 +17488,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16382,6 +17548,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16400,6 +17568,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16543,6 +17713,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16571,6 +17744,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16599,6 +17776,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16655,6 +17836,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16673,6 +17856,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16816,6 +18001,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16844,6 +18032,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16872,6 +18064,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16928,6 +18124,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16946,6 +18144,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17089,6 +18289,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17117,6 +18320,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17145,6 +18352,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17201,6 +18412,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17219,6 +18432,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17362,6 +18577,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17390,6 +18608,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17418,6 +18640,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17474,6 +18700,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17492,6 +18720,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17635,6 +18865,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17663,6 +18896,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17691,6 +18928,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17747,6 +18988,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17765,6 +19008,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17908,6 +19153,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17936,6 +19184,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17964,6 +19216,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18020,6 +19276,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18038,6 +19296,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18181,6 +19441,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18209,6 +19472,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18237,6 +19504,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18293,6 +19564,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18311,6 +19584,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18454,6 +19729,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18482,6 +19760,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18510,6 +19792,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18566,6 +19852,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18584,6 +19872,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18727,6 +20017,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18755,6 +20048,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18783,6 +20080,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18839,6 +20140,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18857,6 +20160,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19000,6 +20305,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19028,6 +20336,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19056,6 +20368,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19112,6 +20428,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19130,6 +20448,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19273,6 +20593,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19301,6 +20624,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19329,6 +20656,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19385,6 +20716,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19403,6 +20736,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19546,6 +20881,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19574,6 +20912,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19602,6 +20944,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19658,6 +21004,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19676,6 +21024,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19819,6 +21169,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19847,6 +21200,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19875,6 +21232,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19931,6 +21292,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19949,6 +21312,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20092,6 +21457,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20120,6 +21488,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20148,6 +21520,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20204,6 +21580,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20222,6 +21600,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index 3bb871467c230..6f2630f39484e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_system_unordered_load(
 ; GFX7-LABEL: flat_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_system_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -200,6 +215,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_load(
 ; GFX7-LABEL: flat_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -215,6 +233,10 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -231,6 +253,10 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -262,6 +288,8 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -276,6 +304,8 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -385,6 +415,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_load(
 ; GFX7-LABEL: flat_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -401,6 +434,10 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -419,6 +456,10 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -452,6 +493,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -468,6 +511,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -591,6 +636,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_load(
 ; GFX7-LABEL: flat_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -608,6 +656,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -628,6 +680,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -664,6 +720,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -681,6 +739,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -819,6 +879,9 @@ entry:
 define amdgpu_kernel void @flat_system_unordered_store(
 ; GFX7-LABEL: flat_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -830,6 +893,10 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -842,6 +909,10 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_system_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -865,6 +936,8 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -876,6 +949,8 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -959,6 +1034,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_store(
 ; GFX7-LABEL: flat_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -970,6 +1048,10 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -982,6 +1064,10 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1005,6 +1091,8 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1016,6 +1104,8 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1099,6 +1189,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_store(
 ; GFX7-LABEL: flat_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1111,6 +1204,10 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1125,6 +1222,10 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX10-CU-LABEL: flat_system_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1151,6 +1252,8 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1164,6 +1267,8 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1265,6 +1370,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_store(
 ; GFX7-LABEL: flat_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1277,6 +1385,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1291,6 +1403,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1317,6 +1433,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1330,6 +1448,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1431,6 +1551,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1443,6 +1566,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1456,6 +1583,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1481,6 +1612,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1493,6 +1626,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1583,6 +1718,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ; GFX7-LABEL: flat_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1597,6 +1735,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1614,6 +1756,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1644,6 +1790,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1659,6 +1807,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1768,6 +1918,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_atomicrmw(
 ; GFX7-LABEL: flat_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1781,6 +1934,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1796,6 +1953,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1824,6 +1985,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1838,6 +2001,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1946,6 +2111,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1961,6 +2129,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1980,6 +2152,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2013,6 +2189,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2030,6 +2208,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2157,6 +2337,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2172,6 +2355,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2191,6 +2378,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2224,6 +2415,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2241,6 +2434,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2368,6 +2563,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2384,6 +2582,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2402,6 +2604,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2435,6 +2641,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2451,6 +2659,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2574,6 +2784,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2591,6 +2804,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2611,6 +2828,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2647,6 +2868,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2665,6 +2888,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2806,6 +3031,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2823,6 +3051,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2843,6 +3075,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2879,6 +3115,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2897,6 +3135,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3038,6 +3278,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3063,6 +3306,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3088,6 +3335,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3138,6 +3389,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3153,6 +3406,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3269,6 +3524,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3296,6 +3554,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3325,6 +3587,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3380,6 +3646,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3398,6 +3666,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3533,6 +3803,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3559,6 +3832,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3586,6 +3863,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3639,6 +3920,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3656,6 +3939,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3790,6 +4075,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3818,6 +4106,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3849,6 +4141,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3907,6 +4203,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3927,6 +4225,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4080,6 +4380,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4108,6 +4411,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4139,6 +4446,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4197,6 +4508,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4217,6 +4530,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4370,6 +4685,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4397,6 +4715,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4426,6 +4748,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4481,6 +4807,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4499,6 +4827,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4634,6 +4964,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4661,6 +4994,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4690,6 +5027,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4745,6 +5086,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4763,6 +5106,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4898,6 +5243,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4926,6 +5274,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4957,6 +5309,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5015,6 +5371,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5035,6 +5393,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5188,6 +5548,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5216,6 +5579,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5247,6 +5614,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5305,6 +5676,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5325,6 +5698,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5478,6 +5853,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5506,6 +5884,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5537,6 +5919,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5595,6 +5981,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5615,6 +6003,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5768,6 +6158,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5796,6 +6189,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5827,6 +6224,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5885,6 +6286,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5905,6 +6308,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6058,6 +6463,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6086,6 +6494,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6117,6 +6529,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6175,6 +6591,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6195,6 +6613,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6348,6 +6768,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6376,6 +6799,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6407,6 +6834,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6465,6 +6896,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6485,6 +6918,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6638,6 +7073,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6666,6 +7104,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6697,6 +7139,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6755,6 +7201,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6775,6 +7223,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6928,6 +7378,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6956,6 +7409,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6987,6 +7444,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7045,6 +7506,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7065,6 +7528,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7218,6 +7683,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7246,6 +7714,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7274,6 +7746,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7330,6 +7806,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7348,6 +7826,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7491,6 +7971,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7520,6 +8003,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7550,6 +8037,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7608,6 +8099,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7628,6 +8121,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7785,6 +8280,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7814,6 +8312,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7844,6 +8346,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7903,6 +8409,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7923,6 +8431,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8084,6 +8594,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8114,6 +8627,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8146,6 +8663,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8207,6 +8728,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8229,6 +8752,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8404,6 +8929,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8434,6 +8962,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8466,6 +8998,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8527,6 +9063,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8549,6 +9087,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8724,6 +9264,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8753,6 +9296,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8783,6 +9330,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8841,6 +9392,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8861,6 +9414,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9018,6 +9573,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9047,6 +9605,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9077,6 +9639,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9135,6 +9701,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9155,6 +9723,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9312,6 +9882,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9342,6 +9915,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9374,6 +9951,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9435,6 +10016,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9457,6 +10040,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9632,6 +10217,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9662,6 +10250,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9694,6 +10286,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9755,6 +10351,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9777,6 +10375,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9952,6 +10552,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9982,6 +10585,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10014,6 +10621,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10075,6 +10686,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10097,6 +10710,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10272,6 +10887,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10302,6 +10920,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10334,6 +10956,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10395,6 +11021,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10417,6 +11045,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10592,6 +11222,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10622,6 +11255,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10654,6 +11291,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10715,6 +11356,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10737,6 +11380,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10912,6 +11557,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10942,6 +11590,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10974,6 +11626,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11035,6 +11691,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11057,6 +11715,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11232,6 +11892,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11262,6 +11925,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11294,6 +11961,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11355,6 +12026,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11377,6 +12050,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11552,6 +12227,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11582,6 +12260,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11614,6 +12296,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11675,6 +12361,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11697,6 +12385,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11872,6 +12562,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ; GFX7-LABEL: flat_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11887,6 +12580,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11903,6 +12600,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11934,6 +12635,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11948,6 +12651,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12057,6 +12762,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ; GFX7-LABEL: flat_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12072,6 +12780,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12088,6 +12800,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12119,6 +12835,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12133,6 +12851,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12242,6 +12962,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ; GFX7-LABEL: flat_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12259,6 +12982,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12278,6 +13005,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12313,6 +13044,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12330,6 +13063,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12458,6 +13193,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12476,6 +13214,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12497,6 +13239,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12535,6 +13281,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12553,6 +13301,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12696,6 +13446,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ; GFX7-LABEL: flat_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12707,6 +13460,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12719,6 +13476,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12742,6 +13503,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12753,6 +13516,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12836,6 +13601,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ; GFX7-LABEL: flat_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12847,6 +13615,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12859,6 +13631,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12882,6 +13658,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12893,6 +13671,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12976,6 +13756,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_store(
 ; GFX7-LABEL: flat_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12988,6 +13771,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13002,6 +13789,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13028,6 +13819,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13041,6 +13834,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13142,6 +13937,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13154,6 +13952,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13168,6 +13970,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13194,6 +14000,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13207,6 +14015,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13308,6 +14118,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13320,6 +14133,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13333,6 +14150,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13358,6 +14179,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13370,6 +14193,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13460,6 +14285,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13474,6 +14302,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13490,6 +14322,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13519,6 +14355,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13534,6 +14372,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13641,6 +14481,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13654,6 +14497,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13669,6 +14516,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13697,6 +14548,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13711,6 +14564,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13819,6 +14674,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13834,6 +14692,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13852,6 +14714,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13884,6 +14750,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13901,6 +14769,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14026,6 +14896,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14041,6 +14914,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14059,6 +14936,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14091,6 +14972,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14108,6 +14991,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14233,6 +15118,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14250,6 +15138,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14269,6 +15161,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14304,6 +15200,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14321,6 +15219,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14449,6 +15349,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14467,6 +15370,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14488,6 +15395,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14526,6 +15437,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14545,6 +15458,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14691,6 +15606,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14709,6 +15627,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14730,6 +15652,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14768,6 +15694,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14787,6 +15715,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14933,6 +15863,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14958,6 +15891,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14983,6 +15920,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15033,6 +15974,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15048,6 +15991,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15164,6 +16109,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15191,6 +16139,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15219,6 +16171,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15273,6 +16229,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15291,6 +16249,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15424,6 +16384,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15450,6 +16413,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15477,6 +16444,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15530,6 +16501,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15547,6 +16520,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15681,6 +16656,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15709,6 +16687,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15739,6 +16721,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15796,6 +16782,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15816,6 +16804,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15967,6 +16957,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15995,6 +16988,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16025,6 +17022,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16082,6 +17083,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16102,6 +17105,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16253,6 +17258,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16280,6 +17288,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16308,6 +17320,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16362,6 +17378,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16380,6 +17398,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16513,6 +17533,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16540,6 +17563,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16568,6 +17595,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16622,6 +17653,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16640,6 +17673,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16773,6 +17808,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16801,6 +17839,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16831,6 +17873,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16888,6 +17934,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16908,6 +17956,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17059,6 +18109,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17087,6 +18140,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17117,6 +18174,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17174,6 +18235,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17194,6 +18257,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17345,6 +18410,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17373,6 +18441,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17403,6 +18475,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17460,6 +18536,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17480,6 +18558,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17631,6 +18711,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17659,6 +18742,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17689,6 +18776,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17746,6 +18837,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17766,6 +18859,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17917,6 +19012,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17945,6 +19043,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17975,6 +19077,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18032,6 +19138,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18052,6 +19160,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18203,6 +19313,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18231,6 +19344,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18261,6 +19378,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18318,6 +19439,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18338,6 +19461,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18489,6 +19614,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18517,6 +19645,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18547,6 +19679,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18604,6 +19740,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18624,6 +19762,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18775,6 +19915,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18803,6 +19946,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18833,6 +19980,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18890,6 +20041,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18910,6 +20063,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19061,6 +20216,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19089,6 +20247,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19117,6 +20279,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19173,6 +20339,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19191,6 +20359,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19334,6 +20504,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19364,6 +20537,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19395,6 +20572,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19455,6 +20636,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19476,6 +20659,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19638,6 +20823,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19667,6 +20855,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19697,6 +20889,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19756,6 +20952,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19776,6 +20974,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19937,6 +21137,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19968,6 +21171,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20001,6 +21208,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20064,6 +21275,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20087,6 +21300,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20267,6 +21482,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20298,6 +21516,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20331,6 +21553,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20394,6 +21620,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20417,6 +21645,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20597,6 +21827,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20627,6 +21860,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20658,6 +21895,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20718,6 +21959,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20739,6 +21982,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20901,6 +22146,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20931,6 +22179,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20962,6 +22214,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21022,6 +22278,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21043,6 +22301,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21205,6 +22465,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21236,6 +22499,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21269,6 +22536,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21332,6 +22603,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21355,6 +22628,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21535,6 +22810,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21566,6 +22844,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21599,6 +22881,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21662,6 +22948,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21685,6 +22973,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21865,6 +23155,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21896,6 +23189,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21929,6 +23226,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21992,6 +23293,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22015,6 +23318,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22195,6 +23500,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22226,6 +23534,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22259,6 +23571,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22322,6 +23638,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22345,6 +23663,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22525,6 +23845,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22556,6 +23879,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22589,6 +23916,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22652,6 +23983,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22675,6 +24008,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22855,6 +24190,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22886,6 +24224,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22919,6 +24261,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22982,6 +24328,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23005,6 +24353,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23185,6 +24535,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23216,6 +24569,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23249,6 +24606,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23312,6 +24673,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23335,6 +24698,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23515,6 +24880,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23546,6 +24914,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23579,6 +24951,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23642,6 +25018,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23665,6 +25043,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index 8708fcf3b4591..301432a063fbc 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -11,6 +11,9 @@
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -27,6 +30,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -44,6 +51,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -152,6 +163,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX7-LABEL: flat_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -181,6 +195,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
@@ -209,6 +227,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_mov_b32 s6, 2
@@ -399,6 +421,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_0(
 ; GFX7-LABEL: flat_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -415,6 +440,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -432,6 +461,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -544,6 +577,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX7-LABEL: flat_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -573,6 +609,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -602,6 +642,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -797,6 +841,9 @@ entry:
 define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -813,6 +860,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -830,6 +881,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -936,6 +991,9 @@ entry:
 define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ; GFX7-LABEL: flat_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -948,6 +1006,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -962,6 +1024,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ;
 ; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index c7826181cc8dd..3b931330acdd7 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_wavefront_unordered_load(
 ; GFX7-LABEL: flat_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -200,6 +215,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ; GFX7-LABEL: flat_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -215,6 +233,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -231,6 +253,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -262,6 +288,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -276,6 +304,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -385,6 +415,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_load(
 ; GFX7-LABEL: flat_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -400,6 +433,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -416,6 +453,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -447,6 +488,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -461,6 +504,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -570,6 +615,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ; GFX7-LABEL: flat_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -585,6 +633,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -601,6 +653,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -632,6 +688,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -646,6 +704,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -755,6 +815,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_unordered_store(
 ; GFX7-LABEL: flat_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -766,6 +829,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -778,6 +845,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -801,6 +872,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -812,6 +885,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -895,6 +970,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ; GFX7-LABEL: flat_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -906,6 +984,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -918,6 +1000,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -941,6 +1027,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -952,6 +1040,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1035,6 +1125,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_store(
 ; GFX7-LABEL: flat_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1046,6 +1139,10 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1058,6 +1155,10 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1081,6 +1182,8 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1092,6 +1195,8 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1175,6 +1280,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ; GFX7-LABEL: flat_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1186,6 +1294,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1198,6 +1310,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1221,6 +1337,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1232,6 +1350,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1315,6 +1435,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1327,6 +1450,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1340,6 +1467,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1365,6 +1496,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1377,6 +1510,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1467,6 +1602,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1479,6 +1617,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1492,6 +1634,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1517,6 +1663,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1529,6 +1677,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1619,6 +1769,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1631,6 +1784,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1644,6 +1801,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1669,6 +1830,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1681,6 +1844,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1771,6 +1936,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1783,6 +1951,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1796,6 +1968,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1821,6 +1997,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1833,6 +2011,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1923,6 +2103,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1935,6 +2118,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1948,6 +2135,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1973,6 +2164,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1985,6 +2178,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2075,6 +2270,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2090,6 +2288,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2106,6 +2308,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2137,6 +2343,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2151,6 +2359,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2260,6 +2470,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2275,6 +2488,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2291,6 +2508,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2322,6 +2543,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2336,6 +2559,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2445,6 +2670,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2460,6 +2688,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2476,6 +2708,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2507,6 +2743,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2521,6 +2759,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2630,6 +2870,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2655,6 +2898,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2680,6 +2927,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2730,6 +2981,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2745,6 +2998,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2861,6 +3116,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2886,6 +3144,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2911,6 +3173,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2961,6 +3227,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2976,6 +3244,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3092,6 +3362,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3117,6 +3390,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3142,6 +3419,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3192,6 +3473,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3207,6 +3490,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3323,6 +3608,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3348,6 +3636,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3373,6 +3665,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3423,6 +3719,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3438,6 +3736,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3554,6 +3854,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3579,6 +3882,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3604,6 +3911,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3654,6 +3965,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3669,6 +3982,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3785,6 +4100,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3810,6 +4128,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3835,6 +4157,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3885,6 +4211,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3900,6 +4228,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4016,6 +4346,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4041,6 +4374,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4066,6 +4403,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4116,6 +4457,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4131,6 +4474,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4247,6 +4592,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4272,6 +4620,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4297,6 +4649,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4347,6 +4703,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4362,6 +4720,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4478,6 +4838,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4503,6 +4866,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4528,6 +4895,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4578,6 +4949,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4593,6 +4966,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4709,6 +5084,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4734,6 +5112,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4759,6 +5141,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4809,6 +5195,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4824,6 +5212,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4940,6 +5330,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4965,6 +5358,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4990,6 +5387,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5040,6 +5441,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5055,6 +5458,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5171,6 +5576,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5196,6 +5604,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5221,6 +5633,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5271,6 +5687,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5286,6 +5704,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5402,6 +5822,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5427,6 +5850,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5452,6 +5879,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5502,6 +5933,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5517,6 +5950,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5633,6 +6068,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5658,6 +6096,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5683,6 +6125,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5733,6 +6179,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5748,6 +6196,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5864,6 +6314,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5889,6 +6342,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5914,6 +6371,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5964,6 +6425,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5979,6 +6442,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6095,6 +6560,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6123,6 +6591,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6151,6 +6623,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6207,6 +6683,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6225,6 +6703,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6368,6 +6848,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6396,6 +6879,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6424,6 +6911,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6480,6 +6971,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6498,6 +6991,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6641,6 +7136,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6669,6 +7167,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6697,6 +7199,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6753,6 +7259,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6771,6 +7279,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6914,6 +7424,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6942,6 +7455,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6970,6 +7487,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7026,6 +7547,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7044,6 +7567,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7187,6 +7712,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7215,6 +7743,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7243,6 +7775,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7299,6 +7835,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7317,6 +7855,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7460,6 +8000,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7488,6 +8031,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7516,6 +8063,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7572,6 +8123,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7590,6 +8143,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7733,6 +8288,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7761,6 +8319,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7789,6 +8351,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7845,6 +8411,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7863,6 +8431,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8006,6 +8576,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8034,6 +8607,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8062,6 +8639,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8118,6 +8699,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8136,6 +8719,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8279,6 +8864,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8307,6 +8895,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8335,6 +8927,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8391,6 +8987,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8409,6 +9007,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8552,6 +9152,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8580,6 +9183,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8608,6 +9215,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8664,6 +9275,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8682,6 +9295,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8825,6 +9440,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8853,6 +9471,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8881,6 +9503,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8937,6 +9563,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8955,6 +9583,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9098,6 +9728,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9126,6 +9759,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9154,6 +9791,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9210,6 +9851,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9228,6 +9871,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9371,6 +10016,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9399,6 +10047,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9427,6 +10079,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9483,6 +10139,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9501,6 +10159,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9644,6 +10304,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9672,6 +10335,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9700,6 +10367,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9756,6 +10427,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9774,6 +10447,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9917,6 +10592,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9945,6 +10623,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9973,6 +10655,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10029,6 +10715,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10047,6 +10735,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10190,6 +10880,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ; GFX7-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10205,6 +10898,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10221,6 +10918,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10252,6 +10953,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10266,6 +10969,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10375,6 +11080,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10390,6 +11098,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10406,6 +11118,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10437,6 +11153,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10451,6 +11169,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10560,6 +11280,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10575,6 +11298,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10591,6 +11318,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10622,6 +11353,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10636,6 +11369,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10745,6 +11480,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10760,6 +11498,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10776,6 +11518,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10807,6 +11553,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10821,6 +11569,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10930,6 +11680,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ; GFX7-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10941,6 +11694,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10953,6 +11710,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10976,6 +11737,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10987,6 +11750,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11070,6 +11835,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11081,6 +11849,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11093,6 +11865,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11116,6 +11892,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11127,6 +11905,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11210,6 +11990,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ; GFX7-LABEL: flat_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11221,6 +12004,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11233,6 +12020,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11256,6 +12047,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11267,6 +12060,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11350,6 +12145,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11361,6 +12159,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11373,6 +12175,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11396,6 +12202,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11407,6 +12215,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11490,6 +12300,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11502,6 +12315,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11515,6 +12332,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11540,6 +12361,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11552,6 +12375,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11642,6 +12467,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11654,6 +12482,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11667,6 +12499,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11692,6 +12528,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11704,6 +12542,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11794,6 +12634,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11806,6 +12649,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11819,6 +12666,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11844,6 +12695,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11856,6 +12709,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11946,6 +12801,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11958,6 +12816,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11971,6 +12833,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11996,6 +12862,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12008,6 +12876,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12098,6 +12968,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12110,6 +12983,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12123,6 +13000,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12148,6 +13029,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12160,6 +13043,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12250,6 +13135,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12265,6 +13153,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12281,6 +13173,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12312,6 +13208,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12326,6 +13224,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12435,6 +13335,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12450,6 +13353,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12466,6 +13373,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12497,6 +13408,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12511,6 +13424,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12620,6 +13535,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12635,6 +13553,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12651,6 +13573,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12682,6 +13608,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12696,6 +13624,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12805,6 +13735,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -12830,6 +13763,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -12855,6 +13792,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -12905,6 +13846,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12920,6 +13863,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13036,6 +13981,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13061,6 +14009,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13086,6 +14038,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13136,6 +14092,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13151,6 +14109,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13267,6 +14227,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13292,6 +14255,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13317,6 +14284,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13367,6 +14338,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13382,6 +14355,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13498,6 +14473,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13523,6 +14501,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13548,6 +14530,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13598,6 +14584,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13613,6 +14601,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13729,6 +14719,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13754,6 +14747,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13779,6 +14776,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13829,6 +14830,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13844,6 +14847,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13960,6 +14965,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13985,6 +14993,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14010,6 +15022,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14060,6 +15076,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14075,6 +15093,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14191,6 +15211,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14216,6 +15239,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14241,6 +15268,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14291,6 +15322,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14306,6 +15339,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14422,6 +15457,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14447,6 +15485,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14472,6 +15514,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14522,6 +15568,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14537,6 +15585,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14653,6 +15703,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14678,6 +15731,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14703,6 +15760,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14753,6 +15814,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14768,6 +15831,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14884,6 +15949,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14909,6 +15977,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14934,6 +16006,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14984,6 +16060,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14999,6 +16077,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15115,6 +16195,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15140,6 +16223,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15165,6 +16252,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15215,6 +16306,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15230,6 +16323,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15346,6 +16441,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15371,6 +16469,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15396,6 +16498,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15446,6 +16552,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15461,6 +16569,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15577,6 +16687,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15602,6 +16715,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15627,6 +16744,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15677,6 +16798,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15692,6 +16815,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15808,6 +16933,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15833,6 +16961,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15858,6 +16990,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15908,6 +17044,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15923,6 +17061,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16039,6 +17179,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16064,6 +17207,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16089,6 +17236,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16139,6 +17290,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16154,6 +17307,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16270,6 +17425,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16298,6 +17456,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16326,6 +17488,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16382,6 +17548,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16400,6 +17568,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16543,6 +17713,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16571,6 +17744,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16599,6 +17776,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16655,6 +17836,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16673,6 +17856,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16816,6 +18001,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16844,6 +18032,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16872,6 +18064,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16928,6 +18124,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16946,6 +18144,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17089,6 +18289,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17117,6 +18320,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17145,6 +18352,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17201,6 +18412,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17219,6 +18432,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17362,6 +18577,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17390,6 +18608,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17418,6 +18640,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17474,6 +18700,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17492,6 +18720,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17635,6 +18865,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17663,6 +18896,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17691,6 +18928,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17747,6 +18988,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17765,6 +19008,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17908,6 +19153,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17936,6 +19184,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17964,6 +19216,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18020,6 +19276,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18038,6 +19296,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18181,6 +19441,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18209,6 +19472,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18237,6 +19504,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18293,6 +19564,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18311,6 +19584,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18454,6 +19729,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18482,6 +19760,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18510,6 +19792,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18566,6 +19852,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18584,6 +19872,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18727,6 +20017,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18755,6 +20048,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18783,6 +20080,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18839,6 +20140,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18857,6 +20160,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19000,6 +20305,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19028,6 +20336,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19056,6 +20368,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19112,6 +20428,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19130,6 +20448,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19273,6 +20593,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19301,6 +20624,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19329,6 +20656,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19385,6 +20716,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19403,6 +20736,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19546,6 +20881,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19574,6 +20912,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19602,6 +20944,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19658,6 +21004,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19676,6 +21024,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19819,6 +21169,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19847,6 +21200,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19875,6 +21232,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19931,6 +21292,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19949,6 +21312,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 405168a1b5c24..f2c287471d360 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_workgroup_unordered_load(
 ; GFX7-LABEL: flat_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -200,6 +215,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ; GFX7-LABEL: flat_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -215,6 +233,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -231,6 +253,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -262,6 +288,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -276,6 +304,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -385,6 +415,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_load(
 ; GFX7-LABEL: flat_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -401,6 +434,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -418,6 +455,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -451,6 +492,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -466,6 +509,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -584,6 +629,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ; GFX7-LABEL: flat_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -601,6 +649,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -620,6 +672,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -655,6 +711,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -671,6 +729,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -800,6 +860,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_unordered_store(
 ; GFX7-LABEL: flat_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -811,6 +874,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -823,6 +890,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -846,6 +917,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -857,6 +930,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -940,6 +1015,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ; GFX7-LABEL: flat_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -951,6 +1029,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -963,6 +1045,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -986,6 +1072,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -997,6 +1085,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1080,6 +1170,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_store(
 ; GFX7-LABEL: flat_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1092,6 +1185,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1106,6 +1203,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1131,6 +1232,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1143,6 +1246,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1237,6 +1342,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ; GFX7-LABEL: flat_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1249,6 +1357,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1263,6 +1375,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1288,6 +1404,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1300,6 +1418,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1394,6 +1514,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1406,6 +1529,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1419,6 +1546,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1444,6 +1575,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1456,6 +1589,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1546,6 +1681,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1559,6 +1697,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1575,6 +1717,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1602,6 +1748,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1615,6 +1763,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1717,6 +1867,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1730,6 +1883,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1745,6 +1902,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1772,6 +1933,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1785,6 +1948,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1886,6 +2051,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1900,6 +2068,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1918,6 +2090,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1947,6 +2123,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1961,6 +2139,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2074,6 +2254,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2088,6 +2271,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2106,6 +2293,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2135,6 +2326,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2149,6 +2342,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2262,6 +2457,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2278,6 +2476,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2295,6 +2497,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2328,6 +2534,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2343,6 +2551,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2461,6 +2671,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2478,6 +2691,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2497,6 +2714,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2532,6 +2753,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2548,6 +2771,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2677,6 +2902,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2694,6 +2922,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2713,6 +2945,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2748,6 +2984,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2764,6 +3002,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2893,6 +3133,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2918,6 +3161,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2943,6 +3190,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2993,6 +3244,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3008,6 +3261,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3124,6 +3379,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3150,6 +3408,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3178,6 +3440,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3230,6 +3496,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3246,6 +3514,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3374,6 +3644,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3400,6 +3673,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3427,6 +3704,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3479,6 +3760,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3495,6 +3778,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3622,6 +3907,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3649,6 +3937,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3679,6 +3971,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3733,6 +4029,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3750,6 +4048,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3889,6 +4189,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3916,6 +4219,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3946,6 +4253,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4000,6 +4311,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4017,6 +4330,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4156,6 +4471,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4182,6 +4500,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4210,6 +4532,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4262,6 +4588,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4278,6 +4606,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4406,6 +4736,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4432,6 +4765,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4460,6 +4797,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4512,6 +4853,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4528,6 +4871,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4656,6 +5001,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4683,6 +5031,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4713,6 +5065,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4767,6 +5123,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4784,6 +5142,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4923,6 +5283,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4950,6 +5313,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4980,6 +5347,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5034,6 +5405,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5051,6 +5424,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5190,6 +5565,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5217,6 +5595,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5247,6 +5629,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5301,6 +5687,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5318,6 +5706,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5457,6 +5847,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5484,6 +5877,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5514,6 +5911,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5568,6 +5969,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5585,6 +5988,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5724,6 +6129,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -5752,6 +6160,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -5780,6 +6192,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -5836,6 +6252,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5854,6 +6272,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5997,6 +6417,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6026,6 +6449,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6055,6 +6482,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6113,6 +6544,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6132,6 +6565,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6284,6 +6719,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6313,6 +6751,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6343,6 +6785,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6401,6 +6847,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6420,6 +6868,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6574,6 +7024,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6604,6 +7057,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6635,6 +7092,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6695,6 +7156,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6715,6 +7178,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6878,6 +7343,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6908,6 +7376,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6939,6 +7411,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6999,6 +7475,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7019,6 +7497,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7182,6 +7662,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7211,6 +7694,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7240,6 +7727,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7298,6 +7789,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7317,6 +7810,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7469,6 +7964,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7498,6 +7996,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7527,6 +8029,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7585,6 +8091,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7604,6 +8112,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7756,6 +8266,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7786,6 +8299,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7817,6 +8334,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7877,6 +8398,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7897,6 +8420,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8060,6 +8585,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8090,6 +8618,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8121,6 +8653,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8181,6 +8717,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8201,6 +8739,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8364,6 +8904,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8394,6 +8937,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8425,6 +8972,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8485,6 +9036,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8505,6 +9058,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8668,6 +9223,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8698,6 +9256,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8729,6 +9291,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8789,6 +9355,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8809,6 +9377,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8972,6 +9542,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9002,6 +9575,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9033,6 +9610,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9093,6 +9674,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9113,6 +9696,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9276,6 +9861,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9306,6 +9894,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9337,6 +9929,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9397,6 +9993,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9417,6 +10015,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9580,6 +10180,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9610,6 +10213,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9641,6 +10248,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9701,6 +10312,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9721,6 +10334,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9884,6 +10499,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9914,6 +10532,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9945,6 +10567,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10005,6 +10631,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10025,6 +10653,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10188,6 +10818,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10203,6 +10836,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10219,6 +10856,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10250,6 +10891,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10264,6 +10907,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10373,6 +11018,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10388,6 +11036,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10404,6 +11056,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10435,6 +11091,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10449,6 +11107,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10558,6 +11218,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10573,6 +11236,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10591,6 +11258,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10622,6 +11293,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10636,6 +11309,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10753,6 +11428,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10768,6 +11446,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10788,6 +11470,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10819,6 +11505,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10833,6 +11521,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10958,6 +11648,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10969,6 +11662,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10981,6 +11678,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11004,6 +11705,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11015,6 +11718,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11098,6 +11803,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11109,6 +11817,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11121,6 +11833,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11144,6 +11860,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11155,6 +11873,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11238,6 +11958,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ; GFX7-LABEL: flat_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11249,6 +11972,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11263,6 +11990,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11286,6 +12017,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11297,6 +12030,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11388,6 +12123,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11399,6 +12137,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11413,6 +12155,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11436,6 +12182,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11447,6 +12195,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11538,6 +12288,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11550,6 +12303,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11563,6 +12320,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11588,6 +12349,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11600,6 +12363,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11690,6 +12455,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11702,6 +12470,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11717,6 +12489,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11742,6 +12518,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11754,6 +12532,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11852,6 +12632,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11864,6 +12647,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11879,6 +12666,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11904,6 +12695,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11916,6 +12709,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12014,6 +12809,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12026,6 +12824,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12043,6 +12845,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12068,6 +12874,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12080,6 +12888,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12186,6 +12996,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12198,6 +13011,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12215,6 +13032,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12240,6 +13061,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12252,6 +13075,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12358,6 +13183,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12373,6 +13201,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12391,6 +13223,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12422,6 +13258,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12436,6 +13274,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12553,6 +13393,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12568,6 +13411,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12588,6 +13435,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12619,6 +13470,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12633,6 +13486,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12758,6 +13613,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12773,6 +13631,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12793,6 +13655,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12824,6 +13690,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12838,6 +13706,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12963,6 +13833,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -12988,6 +13861,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13013,6 +13890,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13063,6 +13944,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13078,6 +13961,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13194,6 +14079,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13219,6 +14107,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13246,6 +14138,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13296,6 +14192,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13311,6 +14209,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13435,6 +14335,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13460,6 +14363,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13487,6 +14394,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13537,6 +14448,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13552,6 +14465,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13676,6 +14591,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13701,6 +14619,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13730,6 +14652,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13780,6 +14706,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13795,6 +14723,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13927,6 +14857,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13952,6 +14885,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13981,6 +14918,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14031,6 +14972,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14046,6 +14989,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14178,6 +15123,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14203,6 +15151,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14230,6 +15182,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14280,6 +15236,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14295,6 +15253,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14419,6 +15379,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14444,6 +15407,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14471,6 +15438,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14521,6 +15492,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14536,6 +15509,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14660,6 +15635,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14685,6 +15663,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14714,6 +15696,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14764,6 +15750,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14779,6 +15767,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14911,6 +15901,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14936,6 +15929,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14965,6 +15962,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15015,6 +16016,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15030,6 +16033,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15162,6 +16167,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15187,6 +16195,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15216,6 +16228,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15266,6 +16282,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15281,6 +16299,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15413,6 +16433,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15438,6 +16461,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15467,6 +16494,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15517,6 +16548,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15532,6 +16565,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15664,6 +16699,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15689,6 +16727,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15718,6 +16760,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15768,6 +16814,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15783,6 +16831,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15915,6 +16965,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15940,6 +16993,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15969,6 +17026,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16019,6 +17080,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16034,6 +17097,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16166,6 +17231,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16191,6 +17259,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16220,6 +17292,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16270,6 +17346,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16285,6 +17363,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16417,6 +17497,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16442,6 +17525,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16471,6 +17558,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16521,6 +17612,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16536,6 +17629,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16668,6 +17763,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16696,6 +17794,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16724,6 +17826,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16780,6 +17886,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16798,6 +17906,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16941,6 +18051,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16969,6 +18082,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16999,6 +18116,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17055,6 +18176,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17073,6 +18196,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17224,6 +18349,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17252,6 +18380,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17282,6 +18414,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17338,6 +18474,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17356,6 +18494,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17507,6 +18647,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17535,6 +18678,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17567,6 +18714,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17623,6 +18774,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17641,6 +18794,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17800,6 +18955,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17828,6 +18986,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17860,6 +19022,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17916,6 +19082,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17934,6 +19102,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18093,6 +19263,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18121,6 +19294,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18151,6 +19328,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18207,6 +19388,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18225,6 +19408,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18376,6 +19561,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18404,6 +19592,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18434,6 +19626,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18490,6 +19686,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18508,6 +19706,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18659,6 +19859,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18687,6 +19890,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18719,6 +19926,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18775,6 +19986,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18793,6 +20006,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18952,6 +20167,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18980,6 +20198,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19012,6 +20234,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19068,6 +20294,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19086,6 +20314,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19245,6 +20475,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19273,6 +20506,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19305,6 +20542,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19361,6 +20602,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19379,6 +20622,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19538,6 +20783,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19566,6 +20814,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19598,6 +20850,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19654,6 +20910,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19672,6 +20930,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19831,6 +21091,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19859,6 +21122,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19891,6 +21158,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19947,6 +21218,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19965,6 +21238,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20124,6 +21399,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20152,6 +21430,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20184,6 +21466,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20240,6 +21526,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20258,6 +21546,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20417,6 +21707,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20445,6 +21738,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20477,6 +21774,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20533,6 +21834,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20551,6 +21854,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20710,6 +22015,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20738,6 +22046,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20770,6 +22082,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20826,6 +22142,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20844,6 +22162,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 793a1bab76d39..769d577a7df16 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_agent_unordered_load(
 ;
 ; GFX7-LABEL: global_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -236,6 +239,9 @@ define amdgpu_kernel void @global_agent_monotonic_load(
 ;
 ; GFX7-LABEL: global_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -433,6 +439,9 @@ define amdgpu_kernel void @global_agent_acquire_load(
 ;
 ; GFX7-LABEL: global_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -650,6 +659,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -872,6 +884,9 @@ define amdgpu_kernel void @global_agent_unordered_store(
 ;
 ; GFX7-LABEL: global_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1039,6 +1054,9 @@ define amdgpu_kernel void @global_agent_monotonic_store(
 ;
 ; GFX7-LABEL: global_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1207,6 +1225,9 @@ define amdgpu_kernel void @global_agent_release_store(
 ;
 ; GFX7-LABEL: global_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1399,6 +1420,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1590,6 +1614,9 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1750,6 +1777,9 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1936,6 +1966,9 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2121,6 +2154,9 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2333,6 +2369,9 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2545,6 +2584,9 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2752,6 +2794,9 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2983,6 +3028,9 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3214,6 +3262,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3440,6 +3491,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3692,6 +3746,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3943,6 +4000,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4221,6 +4281,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4498,6 +4561,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4751,6 +4817,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5005,6 +5074,9 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5283,6 +5355,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5561,6 +5636,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5839,6 +5917,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6117,6 +6198,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6395,6 +6479,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6673,6 +6760,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6951,6 +7041,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7229,6 +7322,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7485,6 +7581,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7761,6 +7860,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8042,6 +8144,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8343,6 +8448,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8643,6 +8751,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8919,6 +9030,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9196,6 +9310,9 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9497,6 +9614,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9798,6 +9918,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10099,6 +10222,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10400,6 +10526,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10701,6 +10830,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11002,6 +11134,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11303,6 +11438,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11602,6 +11740,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11798,6 +11939,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11995,6 +12139,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12212,6 +12359,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12434,6 +12584,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12601,6 +12754,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12769,6 +12925,9 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12961,6 +13120,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13152,6 +13314,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13312,6 +13477,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13498,6 +13666,9 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13683,6 +13854,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13895,6 +14069,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14107,6 +14284,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14314,6 +14494,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14545,6 +14728,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14776,6 +14962,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15002,6 +15191,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15254,6 +15446,9 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15505,6 +15700,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15783,6 +15981,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16060,6 +16261,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16313,6 +16517,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16567,6 +16774,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16845,6 +17055,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17123,6 +17336,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17401,6 +17617,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17679,6 +17898,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17957,6 +18179,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18235,6 +18460,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18513,6 +18741,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18791,6 +19022,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19047,6 +19281,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19324,6 +19561,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19625,6 +19865,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19925,6 +20168,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20201,6 +20447,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20478,6 +20727,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20779,6 +21031,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21080,6 +21335,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21381,6 +21639,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21682,6 +21943,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21983,6 +22247,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22284,6 +22551,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22585,6 +22855,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 9b2b3a4cfa9ba..b97715e0ed8c9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -35,6 +35,9 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ;
 ; GFX7-LABEL: global_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -240,6 +243,9 @@ define amdgpu_kernel void @global_nontemporal_load_1(
 ;
 ; GFX7-LABEL: global_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -498,6 +504,9 @@ define amdgpu_kernel void @global_nontemporal_store_0(
 ;
 ; GFX7-LABEL: global_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -698,6 +707,9 @@ define amdgpu_kernel void @global_nontemporal_store_1(
 ;
 ; GFX7-LABEL: global_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -945,6 +957,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: global_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index afc46fbc23a67..1d7ba89e33959 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
 ;
 ; GFX7-LABEL: global_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -236,6 +239,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -432,6 +438,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -628,6 +637,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -817,6 +829,9 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
 ;
 ; GFX7-LABEL: global_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -984,6 +999,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1151,6 +1169,9 @@ define amdgpu_kernel void @global_singlethread_release_store(
 ;
 ; GFX7-LABEL: global_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1318,6 +1339,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1485,6 +1509,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1643,6 +1670,9 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1801,6 +1831,9 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1959,6 +1992,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2117,6 +2153,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2277,6 +2316,9 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2463,6 +2505,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2649,6 +2694,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2837,6 +2885,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3061,6 +3112,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3285,6 +3339,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3509,6 +3566,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3733,6 +3793,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3957,6 +4020,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4181,6 +4247,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4405,6 +4474,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4629,6 +4701,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4853,6 +4928,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5077,6 +5155,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5301,6 +5382,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5525,6 +5609,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5749,6 +5836,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5973,6 +6063,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6200,6 +6293,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6455,6 +6551,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6710,6 +6809,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6965,6 +7067,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7220,6 +7325,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7475,6 +7583,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7730,6 +7841,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7985,6 +8099,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8240,6 +8357,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8495,6 +8615,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8750,6 +8873,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9005,6 +9131,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9260,6 +9389,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9515,6 +9647,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9770,6 +9905,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10025,6 +10163,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10221,6 +10362,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10417,6 +10561,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10613,6 +10760,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10802,6 +10952,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10969,6 +11122,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11136,6 +11292,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11303,6 +11462,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11470,6 +11632,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11628,6 +11793,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11786,6 +11954,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11944,6 +12115,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12102,6 +12276,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12262,6 +12439,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12448,6 +12628,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12634,6 +12817,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12822,6 +13008,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13046,6 +13235,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13270,6 +13462,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13494,6 +13689,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13718,6 +13916,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13942,6 +14143,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14166,6 +14370,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14390,6 +14597,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14614,6 +14824,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14838,6 +15051,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15062,6 +15278,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15286,6 +15505,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15510,6 +15732,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15734,6 +15959,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15958,6 +16186,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16185,6 +16416,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16440,6 +16674,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16695,6 +16932,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16950,6 +17190,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17205,6 +17448,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17460,6 +17706,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17715,6 +17964,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17970,6 +18222,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18225,6 +18480,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18480,6 +18738,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18735,6 +18996,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18990,6 +19254,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19245,6 +19512,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19500,6 +19770,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19755,6 +20028,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 1dbf0b2e97783..70a4f6a8f09c5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_system_unordered_load(
 ;
 ; GFX7-LABEL: global_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -236,6 +239,9 @@ define amdgpu_kernel void @global_system_monotonic_load(
 ;
 ; GFX7-LABEL: global_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -433,6 +439,9 @@ define amdgpu_kernel void @global_system_acquire_load(
 ;
 ; GFX7-LABEL: global_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -652,6 +661,9 @@ define amdgpu_kernel void @global_system_seq_cst_load(
 ;
 ; GFX7-LABEL: global_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -876,6 +888,9 @@ define amdgpu_kernel void @global_system_unordered_store(
 ;
 ; GFX7-LABEL: global_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1043,6 +1058,9 @@ define amdgpu_kernel void @global_system_monotonic_store(
 ;
 ; GFX7-LABEL: global_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1211,6 +1229,9 @@ define amdgpu_kernel void @global_system_release_store(
 ;
 ; GFX7-LABEL: global_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1405,6 +1426,9 @@ define amdgpu_kernel void @global_system_seq_cst_store(
 ;
 ; GFX7-LABEL: global_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1598,6 +1622,9 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1758,6 +1785,9 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1946,6 +1976,9 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2133,6 +2166,9 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2349,6 +2385,9 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2565,6 +2604,9 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2774,6 +2816,9 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3009,6 +3054,9 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3244,6 +3292,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3470,6 +3521,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3724,6 +3778,9 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3977,6 +4034,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4259,6 +4319,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4540,6 +4603,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4795,6 +4861,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5051,6 +5120,9 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5333,6 +5405,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5615,6 +5690,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5897,6 +5975,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6179,6 +6260,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6435,6 +6519,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6714,6 +6801,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7019,6 +7109,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7323,6 +7416,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7601,6 +7697,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7880,6 +7979,9 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8185,6 +8287,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8490,6 +8595,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8795,6 +8903,9 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9100,6 +9211,9 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9405,6 +9519,9 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9710,6 +9827,9 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10015,6 +10135,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10318,6 +10441,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10514,6 +10640,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10711,6 +10840,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10930,6 +11062,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11154,6 +11289,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11321,6 +11459,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11489,6 +11630,9 @@ define amdgpu_kernel void @global_system_one_as_release_store(
 ;
 ; GFX7-LABEL: global_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11683,6 +11827,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11876,6 +12023,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12036,6 +12186,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12224,6 +12377,9 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12411,6 +12567,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12627,6 +12786,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12843,6 +13005,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13052,6 +13217,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13287,6 +13455,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13522,6 +13693,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13748,6 +13922,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14002,6 +14179,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14255,6 +14435,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14537,6 +14720,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14818,6 +15004,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15073,6 +15262,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15329,6 +15521,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15611,6 +15806,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15893,6 +16091,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16175,6 +16376,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16457,6 +16661,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16739,6 +16946,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17021,6 +17231,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17303,6 +17516,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17585,6 +17801,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17841,6 +18060,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18119,6 +18341,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18402,6 +18627,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18707,6 +18935,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19011,6 +19242,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19289,6 +19523,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19568,6 +19805,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19873,6 +20113,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20178,6 +20421,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20483,6 +20729,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20788,6 +21037,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21093,6 +21345,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21398,6 +21653,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21703,6 +21961,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 11206cb695aa4..d7c109845a825 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -36,6 +36,9 @@ define amdgpu_kernel void @global_volatile_load_0(
 ;
 ; GFX7-LABEL: global_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -190,6 +193,9 @@ define amdgpu_kernel void @global_volatile_load_1(
 ;
 ; GFX7-LABEL: global_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -381,6 +387,9 @@ define amdgpu_kernel void @global_volatile_store_0(
 ;
 ; GFX7-LABEL: global_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -544,6 +553,9 @@ define amdgpu_kernel void @global_volatile_store_1(
 ;
 ; GFX7-LABEL: global_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -741,6 +753,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
 ;
 ; GFX7-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -884,6 +899,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
 ;
 ; GFX7-LABEL: global_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index f805e2cf37006..3c1e0e1d3d9af 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
 ;
 ; GFX7-LABEL: global_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -236,6 +239,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -432,6 +438,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -628,6 +637,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -817,6 +829,9 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
 ;
 ; GFX7-LABEL: global_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -984,6 +999,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1151,6 +1169,9 @@ define amdgpu_kernel void @global_wavefront_release_store(
 ;
 ; GFX7-LABEL: global_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1318,6 +1339,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1485,6 +1509,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1643,6 +1670,9 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1801,6 +1831,9 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1959,6 +1992,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2117,6 +2153,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2277,6 +2316,9 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2463,6 +2505,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2649,6 +2694,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2837,6 +2885,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3061,6 +3112,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3285,6 +3339,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3509,6 +3566,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3733,6 +3793,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3957,6 +4020,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4181,6 +4247,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4405,6 +4474,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4629,6 +4701,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4853,6 +4928,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5077,6 +5155,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5301,6 +5382,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5525,6 +5609,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5749,6 +5836,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5973,6 +6063,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6200,6 +6293,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6455,6 +6551,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6710,6 +6809,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6965,6 +7067,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7220,6 +7325,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7475,6 +7583,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7730,6 +7841,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7985,6 +8099,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8240,6 +8357,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8495,6 +8615,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8750,6 +8873,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9005,6 +9131,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9260,6 +9389,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9515,6 +9647,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9770,6 +9905,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10025,6 +10163,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10221,6 +10362,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10417,6 +10561,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10613,6 +10760,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10802,6 +10952,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10969,6 +11122,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11136,6 +11292,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11303,6 +11462,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11470,6 +11632,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11628,6 +11793,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11786,6 +11954,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11944,6 +12115,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12102,6 +12276,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12262,6 +12439,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12448,6 +12628,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12634,6 +12817,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12822,6 +13008,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13046,6 +13235,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13270,6 +13462,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13494,6 +13689,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13718,6 +13916,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13942,6 +14143,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14166,6 +14370,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14390,6 +14597,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14614,6 +14824,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14838,6 +15051,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15062,6 +15278,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15286,6 +15505,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15510,6 +15732,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15734,6 +15959,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15958,6 +16186,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16185,6 +16416,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16440,6 +16674,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16695,6 +16932,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16950,6 +17190,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17205,6 +17448,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17460,6 +17706,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17715,6 +17964,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17970,6 +18222,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18225,6 +18480,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18480,6 +18738,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18735,6 +18996,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18990,6 +19254,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19245,6 +19512,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19500,6 +19770,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19755,6 +20028,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 643684b7550ad..d1d52b217ff70 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
 ;
 ; GFX7-LABEL: global_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -236,6 +239,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -432,6 +438,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -636,6 +645,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -841,6 +853,9 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
 ;
 ; GFX7-LABEL: global_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1008,6 +1023,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1176,6 +1194,9 @@ define amdgpu_kernel void @global_workgroup_release_store(
 ;
 ; GFX7-LABEL: global_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1361,6 +1382,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1545,6 +1569,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1703,6 +1730,9 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1872,6 +1902,9 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2048,6 +2081,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2234,6 +2270,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2421,6 +2460,9 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2615,6 +2657,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2826,6 +2871,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3038,6 +3086,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3262,6 +3313,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3497,6 +3551,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3739,6 +3796,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3991,6 +4051,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4242,6 +4305,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4476,6 +4542,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4711,6 +4780,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4963,6 +5035,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5215,6 +5290,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5467,6 +5545,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5719,6 +5800,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5971,6 +6055,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6223,6 +6310,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6475,6 +6565,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6729,6 +6822,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6984,6 +7080,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7247,6 +7346,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7520,6 +7622,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7800,6 +7905,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8079,6 +8187,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8341,6 +8452,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8604,6 +8718,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8884,6 +9001,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9164,6 +9284,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9444,6 +9567,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9724,6 +9850,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10004,6 +10133,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10284,6 +10416,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10564,6 +10699,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10843,6 +10981,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11039,6 +11180,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11235,6 +11379,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11438,6 +11585,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11640,6 +11790,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11807,6 +11960,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11974,6 +12130,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12151,6 +12310,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12328,6 +12490,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12486,6 +12651,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12654,6 +12822,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12822,6 +12993,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13000,6 +13174,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13180,6 +13357,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13373,6 +13553,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13576,6 +13759,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13781,6 +13967,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14005,6 +14194,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14239,6 +14431,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14473,6 +14668,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14717,6 +14915,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14961,6 +15162,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15195,6 +15399,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15429,6 +15636,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15673,6 +15883,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15917,6 +16130,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16161,6 +16377,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16405,6 +16624,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16649,6 +16871,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16893,6 +17118,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17137,6 +17365,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17384,6 +17615,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17639,6 +17873,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17901,6 +18138,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18166,6 +18406,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18438,6 +18681,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18710,6 +18956,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18972,6 +19221,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19234,6 +19486,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19506,6 +19761,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19778,6 +20036,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20050,6 +20311,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20322,6 +20586,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20594,6 +20861,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20866,6 +21136,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21138,6 +21411,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index ba9711333a194..403e3012baa22 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -37,6 +37,9 @@ define amdgpu_kernel void @local_nontemporal_load_0(
 ;
 ; GFX7-LABEL: local_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -238,6 +241,9 @@ define amdgpu_kernel void @local_nontemporal_load_1(
 ;
 ; GFX7-LABEL: local_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -881,6 +887,9 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: local_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 2a15a0d0727fd..99dafea81ae33 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -33,6 +33,9 @@ define amdgpu_kernel void @local_volatile_load_0(
 ;
 ; GFX7-LABEL: local_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -178,6 +181,9 @@ define amdgpu_kernel void @local_volatile_load_1(
 ;
 ; GFX7-LABEL: local_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 4e08065e879fd..9166046b55fca 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -37,7 +37,10 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX7-LABEL: private_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -53,7 +56,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -69,7 +72,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -110,7 +113,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -126,7 +129,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -246,7 +249,10 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX7-LABEL: private_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -264,7 +270,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
@@ -282,7 +288,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
@@ -327,7 +333,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
@@ -348,7 +354,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
@@ -499,7 +505,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX7-LABEL: private_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -514,7 +520,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -530,7 +536,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -565,7 +571,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -581,7 +587,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -695,7 +701,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX7-LABEL: private_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -712,7 +718,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -728,7 +734,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -765,7 +771,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -784,7 +790,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -929,7 +935,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: private_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -945,7 +954,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -961,7 +970,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1002,7 +1011,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1018,7 +1027,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index a68b5f36b806e..ddee2f39c4ba1 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -37,7 +37,10 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX7-LABEL: private_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -53,7 +56,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX10-WGP-LABEL: private_volatile_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -69,7 +72,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX10-CU-LABEL: private_volatile_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -196,7 +199,10 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX7-LABEL: private_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -214,7 +220,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX10-WGP-LABEL: private_volatile_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
@@ -232,7 +238,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX10-CU-LABEL: private_volatile_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
@@ -378,7 +384,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX7-LABEL: private_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -394,7 +400,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX10-WGP-LABEL: private_volatile_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -411,7 +417,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX10-CU-LABEL: private_volatile_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -539,7 +545,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX7-LABEL: private_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -557,7 +563,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX10-WGP-LABEL: private_volatile_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -574,7 +580,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX10-CU-LABEL: private_volatile_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
index d070dc3b770f8..1fd311f225db4 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -203,13 +203,13 @@ attributes #5 = { "amdgpu-flat-work-group-size"="128,512" }
 attributes #6 = { "amdgpu-flat-work-group-size"="512,512" }
 attributes #7 = { "amdgpu-flat-work-group-size"="64,256" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index f62f1d57aec8e..9577f2a932f96 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -399,26 +399,26 @@ attributes #17 = { "amdgpu-waves-per-eu"="5,8" }
 attributes #18 = { "amdgpu-waves-per-eu"="9,10" }
 attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
index c1d647c5d3b90..9758afa3b4991 100644
--- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
@@ -19,5 +19,5 @@ define void @hoge()  {
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index 384a9c4043a1d..ab51d6e249e90 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -191,11 +191,11 @@ define amdgpu_kernel void @kernel_lds_recursion() {
 !1 = !{i32 1, !"amdhsa_code_object_version", i32 400}
 
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
 ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index d430ba758572d..2292b45ddf15a 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -7,7 +7,7 @@
 define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 {
 ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_u32 s0, s0, s13
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; GCN-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 539cfc71a80f9..6e7ee42d4ec8b 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -73,7 +73,7 @@ define amdgpu_kernel void @test_simple_indirect_call() {
 ;.
 ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" }
 ;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
 ;.
 
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
index 049db01badacf..2ab48479fa1b0 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
@@ -31,6 +31,6 @@ define amdgpu_kernel void @kernel1() #1 {
 
 attributes #0 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
index c9387f196dff9..cc58d34a8b255 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
@@ -98,7 +98,7 @@ define amdgpu_kernel void @kernel2() #0 {
 attributes #0 = { "uniform-work-group-size"="true" }
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR2]] = { "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
index 7183da2c5efc3..33298bde89b97 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
@@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel3() #2 {
 
 attributes #2 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
index 6ed04cf63d20b..6fe85cd7c9e65 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @kernel2() #2 {
 
 attributes #1 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
index d5ba2fd617c6e..f6ab402e01232 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
@@ -52,8 +52,8 @@ attributes #0 = { nounwind }
 attributes #1 = { "uniform-work-group-size"="false" }
 attributes #2 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3]] = { "uniform-work-group-size"="true" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
index 7f0dfeaf75c80..37e3376ef9d50 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
@@ -101,7 +101,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 {
 attributes #0 = { nounwind readnone }
 attributes #1 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
index 8616c73ad51c1..7f83686bc756e 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
@@ -61,6 +61,6 @@ define amdgpu_kernel void @kernel3() #0 {
 
 attributes #0 = { "uniform-work-group-size"="false" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index 20dc5ad5c8665..257566053be14 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -11,7 +11,7 @@
 define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9()  {
 ; CHECK-LABEL: __omp_offloading_16_dd2df_main_l9:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_add_u32 s0, s0, s13
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v0



More information about the llvm-commits mailing list