[clang] [llvm] [AMDGPU] Infer amdgpu-no-flat-scratch-init attribute in AMDGPUAttributor (PR #94647)

Jun Wang via cfe-commits cfe-commits at lists.llvm.org
Wed Sep 18 15:57:16 PDT 2024


https://github.com/jwanggit86 updated https://github.com/llvm/llvm-project/pull/94647

>From f723823e7623e275ce7180ecb7b5c3d332b01c3c Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Thu, 6 Jun 2024 10:45:12 -0700
Subject: [PATCH 1/8] [AMDGPU] Infer amdgpu-no-flat-scratch-init attribute in
 AMDGPUAttributor

The AMDGPUAnnotateKernelFeatures pass infers the "amdgpu-calls" and
"amdgpu-stack-objects" attributes, which are used to infer whether we need to
initialize flat scratch. This is, however, not precise. Instead, we should use
AMDGPUAttributor and infer amdgpu-no-flat-scratch-init on kernels.
Refer to https://github.com/llvm/llvm-project/issues/63586 .
---
 llvm/lib/Target/AMDGPU/AMDGPUAttributes.def   |    1 +
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |   43 +
 ...licit-kernarg-backend-usage-global-isel.ll |   30 +-
 .../AMDGPU/addrspacecast-constantexpr.ll      |    6 +-
 .../AMDGPU/amdgpu-attributor-no-agpr.ll       |   21 +-
 .../annotate-kernel-features-hsa-call.ll      |  112 +-
 .../AMDGPU/annotate-kernel-features-hsa.ll    |   44 +-
 .../AMDGPU/annotate-kernel-features.ll        |   18 +-
 .../attributor-flatscratchinit-globalisel.ll  | 1028 ++++++++++++
 .../AMDGPU/attributor-flatscratchinit.ll      |  914 +++++++++++
 llvm/test/CodeGen/AMDGPU/attributor-noopt.ll  |    2 +-
 .../AMDGPU/call-graph-register-usage.ll       |    8 +-
 .../callee-special-input-sgprs-fixed-abi.ll   |   36 +-
 .../CodeGen/AMDGPU/direct-indirect-call.ll    |    4 +-
 .../AMDGPU/duplicate-attribute-indirect.ll    |    4 +-
 .../test/CodeGen/AMDGPU/flat-address-space.ll |    8 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |   31 +-
 .../AMDGPU/implicitarg-offset-attributes.ll   |   30 +-
 llvm/test/CodeGen/AMDGPU/ipra.ll              |    2 +-
 llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll  |  248 +--
 .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll       |   29 +-
 .../AMDGPU/lower-module-lds-via-hybrid.ll     |   15 +-
 .../AMDGPU/lower-module-lds-via-table.ll      |   15 +-
 .../AMDGPU/memory-legalizer-flat-agent.ll     | 1380 +++++++++++++++++
 .../memory-legalizer-flat-nontemporal.ll      |   75 +
 .../memory-legalizer-flat-singlethread.ll     | 1380 +++++++++++++++++
 .../AMDGPU/memory-legalizer-flat-system.ll    | 1380 +++++++++++++++++
 .../AMDGPU/memory-legalizer-flat-volatile.ll  |   66 +
 .../AMDGPU/memory-legalizer-flat-wavefront.ll | 1365 ++++++++++++++++
 .../AMDGPU/memory-legalizer-flat-workgroup.ll | 1320 ++++++++++++++++
 .../AMDGPU/memory-legalizer-global-agent.ll   |  273 ++++
 .../memory-legalizer-global-nontemporal.ll    |   15 +
 .../memory-legalizer-global-singlethread.ll   |  276 ++++
 .../AMDGPU/memory-legalizer-global-system.ll  |  261 ++++
 .../memory-legalizer-global-volatile.ll       |   18 +
 .../memory-legalizer-global-wavefront.ll      |  276 ++++
 .../memory-legalizer-global-workgroup.ll      |  276 ++++
 .../memory-legalizer-local-nontemporal.ll     |    9 +
 .../AMDGPU/memory-legalizer-local-volatile.ll |    6 +
 .../memory-legalizer-private-nontemporal.ll   |   59 +-
 .../memory-legalizer-private-volatile.ll      |   30 +-
 .../AMDGPU/propagate-flat-work-group-size.ll  |   18 +-
 .../CodeGen/AMDGPU/propagate-waves-per-eu.ll  |   44 +-
 .../AMDGPU/recursive_global_initializer.ll    |    2 +-
 .../AMDGPU/remove-no-kernel-id-attribute.ll   |    8 +-
 .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll     |    2 +-
 .../CodeGen/AMDGPU/simple-indirect-call.ll    |    4 +-
 .../uniform-work-group-attribute-missing.ll   |    4 +-
 .../AMDGPU/uniform-work-group-multistep.ll    |    4 +-
 ...niform-work-group-nested-function-calls.ll |    4 +-
 ...ork-group-prevent-attribute-propagation.ll |    6 +-
 .../uniform-work-group-propagate-attribute.ll |    4 +-
 .../uniform-work-group-recursion-test.ll      |    6 +-
 .../CodeGen/AMDGPU/uniform-work-group-test.ll |    4 +-
 .../AMDGPU/vgpr-spill-placement-issue61083.ll |    2 +-
 55 files changed, 10758 insertions(+), 468 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
index bacc8e4e821e5d..8c1c8219690ba5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributes.def
@@ -30,5 +30,6 @@ AMDGPU_ATTRIBUTE(WORKITEM_ID_Z, "amdgpu-no-workitem-id-z")
 AMDGPU_ATTRIBUTE(LDS_KERNEL_ID, "amdgpu-no-lds-kernel-id")
 AMDGPU_ATTRIBUTE(DEFAULT_QUEUE, "amdgpu-no-default-queue")
 AMDGPU_ATTRIBUTE(COMPLETION_ACTION, "amdgpu-no-completion-action")
+AMDGPU_ATTRIBUTE(FLAT_SCRATCH_INIT, "amdgpu-no-flat-scratch-init")
 
 #undef AMDGPU_ATTRIBUTE
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index d65e0ae92308e6..2017ab70c96f61 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -434,6 +434,19 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       indicatePessimisticFixpoint();
       return;
     }
+
+    bool HasAllocaOrASCast = false;
+    for (BasicBlock &BB : *F) {
+      for (Instruction &I : BB) {
+        if (isa<AllocaInst>(I) || isa<AddrSpaceCastInst>(I)) {
+          HasAllocaOrASCast = true;
+          removeAssumedBits(FLAT_SCRATCH_INIT);
+          break;
+        }
+      }
+      if (HasAllocaOrASCast)
+        break;
+    }
   }
 
   ChangeStatus updateImpl(Attributor &A) override {
@@ -520,6 +533,9 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
       removeAssumedBits(COMPLETION_ACTION);
 
+    if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
+      removeAssumedBits(FLAT_SCRATCH_INIT);
+
     return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                        : ChangeStatus::UNCHANGED;
   }
@@ -678,6 +694,33 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                               UsedAssumedInformation);
   }
+
+  // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
+  // not to be set.
+  bool needFlatScratchInit(Attributor &A) {
+    // This is called on each call-like instruction; false means the call may
+    // need flat scratch init, so this function can't be no-flat-scratch-init.
+    auto CheckForNoFlatScratchInit = [&](Instruction &I) {
+      const auto &CB = cast<CallBase>(I);
+      const Value *CalleeOp = CB.getCalledOperand();
+      const Function *Callee = dyn_cast<Function>(CalleeOp);
+      if (!Callee) // indirect call
+        return CB.isInlineAsm();
+
+      if (Callee->isIntrinsic())
+        return true;
+
+      const auto *CalleeInfo = A.getAAFor<AAAMDAttributes>(
+          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
+      return CalleeInfo && CalleeInfo->isAssumed(FLAT_SCRATCH_INIT);
+    };
+
+    bool UsedAssumedInformation = false;
+    // If the check returns false for any call (i.e. FlatScratchInit is
+    // needed), checkForAllCallLikeInstructions returns false.
+    return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
+                                              UsedAssumedInformation);
+  }
 };
 
 AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 9443b39dcdc033..80e9ae33d6d45e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -10,9 +10,11 @@
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x40
+; GFX8V4-NEXT:    s_add_i32 s8, s8, s11
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s9
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_mov_b32 s4, s0
 ; GFX8V4-NEXT:    s_mov_b32 s5, s3
@@ -23,6 +25,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], s[6:7], 0
+; GFX8V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
@@ -35,9 +38,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX8V5-LABEL: addrspacecast:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
-; GFX8V5-NEXT:    v_mov_b32_e32 v2, 1
+; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xc8
+; GFX8V5-NEXT:    s_add_i32 s6, s6, s9
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_mov_b32 s4, s0
 ; GFX8V5-NEXT:    s_mov_b32 s5, s2
@@ -47,6 +52,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], s[2:3], 0
+; GFX8V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
@@ -59,10 +65,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX9V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_mov_b32 s2, s0
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
@@ -71,6 +78,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V4-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
@@ -83,10 +91,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX9V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_mov_b32 s2, s0
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
@@ -95,6 +104,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V5-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index cff9ce05066793..96bbcb7ed2149a 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -233,9 +233,9 @@ attributes #1 = { nounwind }
 ; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
 ;.
 ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 33b1cc65dc5699..5ace66fd2dd762 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -116,7 +116,7 @@ define amdgpu_kernel void @kernel_calls_extern() {
 define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
 ; CHECK-SAME: ) #[[ATTR4]] {
-; CHECK-NEXT:    call void @unknown() #[[ATTR9:[0-9]+]]
+; CHECK-NEXT:    call void @unknown() #[[ATTR10:[0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
   call void @unknown() #0
@@ -136,7 +136,7 @@ define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
 define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
 ; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] {
-; CHECK-NEXT:    call void [[INDIRECT]]() #[[ATTR9]]
+; CHECK-NEXT:    call void [[INDIRECT]]() #[[ATTR10]]
 ; CHECK-NEXT:    ret void
 ;
   call void %indirect() #0
@@ -229,7 +229,7 @@ define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) {
 
 define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 ; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
-; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR7:[0-9]+]] {
 ; CHECK-NEXT:    [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
 ; CHECK-NEXT:    call void [[FPTR]]()
 ; CHECK-NEXT:    ret void
@@ -242,14 +242,15 @@ define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 
 attributes #0 = { "amdgpu-no-agpr" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3:[0-9]+]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-no-agpr" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index 3d4ae84d9c698e..23294eacbe6cb3 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -530,7 +530,7 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null)
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -545,7 +545,7 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null)
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -593,7 +593,7 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ATTRIBUTOR_HSA-NEXT:    store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -611,7 +611,7 @@ define void @use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ATTRIBUTOR_HSA-NEXT:    store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -628,7 +628,7 @@ define void @func_indirect_use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_implicitarg_ptr()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -645,7 +645,7 @@ define internal void @defined.func() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -658,7 +658,7 @@ define void @func_call_external() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @external.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -673,7 +673,7 @@ define void @func_call_defined() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @defined.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -687,8 +687,8 @@ define void @func_call_asm() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
-; ATTRIBUTOR_HSA-NEXT:    call void asm sideeffect "", ""() #[[ATTR28:[0-9]+]]
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
+; ATTRIBUTOR_HSA-NEXT:    call void asm sideeffect "", ""() #[[ATTR31:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   call void asm sideeffect "", ""() #3
@@ -702,7 +702,7 @@ define amdgpu_kernel void @kern_call_external() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @external.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -717,7 +717,7 @@ define amdgpu_kernel void @func_kern_defined() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @defined.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -769,7 +769,7 @@ define float @func_indirect_call(ptr %fptr) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float [[FPTR]]()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -788,7 +788,7 @@ define float @func_extern_call() #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @extern()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -806,7 +806,7 @@ define float @func_null_call(ptr %fptr) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float null()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -827,7 +827,7 @@ define float @func_other_intrinsic_call(float %arg) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call
-; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR17]] {
+; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR20]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]])
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -845,7 +845,7 @@ define amdgpu_kernel void @kern_sanitize_address() #4 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -861,7 +861,7 @@ define void @func_sanitize_address() #4 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -877,7 +877,7 @@ define void @func_indirect_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -893,7 +893,7 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -912,7 +912,7 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR21]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @extern_func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -928,7 +928,7 @@ define internal void @enqueue_block_def() #6 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR28:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -941,7 +941,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR26:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR29:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @enqueue_block_decl()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -956,7 +956,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR30:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @enqueue_block_def()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -969,7 +969,7 @@ define void @unused_enqueue_block() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -980,7 +980,7 @@ define internal void @known_func() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -994,8 +994,8 @@ define amdgpu_kernel void @kern_callsite_enqueue_block() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR27]] {
-; ATTRIBUTOR_HSA-NEXT:    call void @known_func() #[[ATTR29:[0-9]+]]
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] {
+; ATTRIBUTOR_HSA-NEXT:    call void @known_func() #[[ATTR32:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   call void @known_func() #6
@@ -1024,36 +1024,40 @@ attributes #6 = { "enqueued-block" }
 ; AKF_HSA: attributes #[[ATTR7]] = { "enqueued-block" }
 ; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" }
 ;.
+
 ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR23:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR24:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind }
-; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR26:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR27:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR30]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR31]] = { nounwind }
+; ATTRIBUTOR_HSA: attributes #[[ATTR32]] = { "enqueued-block" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 879bceaef97c00..b153069fbee16b 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -452,7 +452,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 
 define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_group_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(3) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -464,7 +464,7 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 {
 
 define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_private_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(5) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -511,7 +511,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %
 
 define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_global_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(1) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -523,7 +523,7 @@ define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 {
 
 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_constant_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(4)
 ; HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -542,7 +542,7 @@ define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared
-; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]])
 ; ATTRIBUTOR_HSA-NEXT:    [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32
 ; ATTRIBUTOR_HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) undef, align 4
@@ -563,7 +563,7 @@ define amdgpu_kernel void @use_is_private(ptr %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private
-; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]])
 ; ATTRIBUTOR_HSA-NEXT:    [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32
 ; ATTRIBUTOR_HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) undef, align 4
@@ -583,7 +583,7 @@ define amdgpu_kernel void @use_alloca() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; ATTRIBUTOR_HSA-NEXT:    store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -604,7 +604,7 @@ define amdgpu_kernel void @use_alloca_non_entry_block() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] {
 ; ATTRIBUTOR_HSA-NEXT:  entry:
 ; ATTRIBUTOR_HSA-NEXT:    br label [[BB:%.*]]
 ; ATTRIBUTOR_HSA:       bb:
@@ -629,7 +629,7 @@ define void @use_alloca_func() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR13:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; ATTRIBUTOR_HSA-NEXT:    store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -651,19 +651,21 @@ attributes #1 = { nounwind }
 ; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" }
 ;.
 ; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index 547ff69592ca0e..89fe46d975309a 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -398,13 +398,13 @@ attributes #1 = { nounwind }
 ; AKF_CHECK: attributes #[[ATTR1]] = { nounwind }
 ;.
 ; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
new file mode 100644
index 00000000000000..ce5a3eedb5ebb1
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -0,0 +1,1028 @@
+; Test the generation of the attribute amdgpu-no-flat-scratch-init
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator < %s | FileCheck -check-prefixes=GFX10 %s
+
+;; tests of alloca
+
+define void @without_alloca(i1 %arg0) {
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @with_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+; graphics functions won't get the attribute amdgpu-no-flat-scratch-init
+
+define amdgpu_vs void @with_alloca_cc_vs() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_gs void @with_alloca_cc_gs() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_ps void @with_alloca_cc_ps() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs void @with_alloca_cc_cs() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_hs void @with_alloca_cc_hs() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_ls void @with_alloca_cc_ls() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_es void @with_alloca_cc_es() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_gfx void @with_alloca_cc_gfx() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define void @call_without_alloca() {
+  call void @without_alloca(i1 true)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_cc_kernel() {
+  call void @without_alloca(i1 true)
+  ret void
+}
+
+define void @call_with_alloca() {
+  call void @with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_with_alloca_cc_kernel() {
+  call void @with_alloca()
+  ret void
+}
+
+define void @call_both_with_and_without_alloca() {
+  call void @with_alloca()
+  call void @without_alloca(i1 true) ; pass the i1 operand so the call type matches the definition of @without_alloca(i1)
+  ret void
+}
+
+define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
+  call void @with_alloca()
+  call void @without_alloca(i1 true) ; pass the i1 operand so the call type matches the definition of @without_alloca(i1)
+  ret void
+}
+
+define void @call_call_without_alloca() {
+  call void @call_without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
+  call void @call_without_alloca()
+  ret void
+}
+
+define void @call_call_with_alloca() {
+  call void @call_with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
+  call void @call_with_alloca()
+  ret void
+}
+
+define void @with_alloca_call_without_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @without_alloca(i1 true) ; pass the i1 operand so the call type matches the definition of @without_alloca(i1)
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @without_alloca(i1 true) ; pass the i1 operand so the call type matches the definition of @without_alloca(i1)
+  ret void
+}
+
+define void @with_alloca_call_with_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @with_alloca()
+  ret void
+}
+
+define void @with_alloca_call_call_without_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_without_alloca()
+  ret void
+}
+
+define void @with_alloca_call_call_with_alloca() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_with_alloca()
+  ret void
+}
+
+;; tests of addrspacecast
+
+define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
+  store volatile i32 0, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
+  store volatile i32 0, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
+  %stof = addrspacecast ptr addrspace(1) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
+  %stof = addrspacecast ptr addrspace(1) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
+  store volatile i32 0, ptr addrspace(2) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
+  store volatile i32 0, ptr addrspace(2) %ptr
+  ret void
+}
+
+define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
+  %stof = addrspacecast ptr addrspace(2) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
+  %stof = addrspacecast ptr addrspace(2) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
+  store volatile i32 0, ptr addrspace(3) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
+  store volatile i32 0, ptr addrspace(3) %ptr
+  ret void
+}
+
+define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
+  store volatile i32 0, ptr addrspace(4) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
+  store volatile i32 0, ptr addrspace(4) %ptr
+  ret void
+}
+
+define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
+  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
+  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  store volatile i32 0, ptr addrspace(5) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  store volatile i32 0, ptr addrspace(5) %ptr
+  ret void
+}
+
+define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+;; tests of mixed alloca and addrspacecast
+
+define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @without_alloca(i1 true)
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @without_alloca(i1 true)
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
+  call void @without_alloca(i1 true)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+  call void @without_alloca(i1 true)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+;; tests of indirect call, intrinsics
+
+@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4 ; function pointer loaded and called by the indirect-call tests
+
+define void @with_indirect_call() {
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
+  call void %fptr()
+  ret void
+}
+
+define amdgpu_kernel void @with_indirect_call_cc_kernel() {
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
+  call void %fptr()
+  ret void
+}
+
+define void @call_with_indirect_call() {
+  call void @with_indirect_call()
+  ret void
+}
+
+define amdgpu_kernel void @call_with_indirect_call_cc_kernel() {
+  call void @with_indirect_call()
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() ; intrinsic called by the use_intrinsic_workitem_id_x tests
+
+define void @use_intrinsic_workitem_id_x() {
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define void @call_use_intrinsic_workitem_id_x() {
+  call void @use_intrinsic_workitem_id_x()
+  ret void
+}
+
+define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
+  call void @use_intrinsic_workitem_id_x()
+  ret void
+}
+
+; GFX10: name:            without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_cc_vs
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_gs
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
+;
+; GFX10: name:            with_alloca_cc_ps
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_cs
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_hs
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
+;
+; GFX10: name:            with_alloca_cc_ls
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_es
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+;
+; GFX10: name:            with_alloca_cc_gfx
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:  psInputAddr:     0
+;
+; GFX10: name:            with_alloca_cc_cs_chain
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' }
+; GFX10-NEXT:  psInputAddr:     0
+;
+; GFX10: name:            with_alloca_cc_cs_chain_preserve
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' }
+; GFX10-NEXT:  psInputAddr:     0
+;
+; GFX10: name:            call_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
+;
+; GFX10: name:            call_with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_both_with_and_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_both_with_and_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_call_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
+;
+; GFX10: name:            call_call_with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_call_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca_call_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_call_with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca_call_with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_call_call_without_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca_call_call_without_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_alloca_call_call_with_alloca
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_alloca_call_call_with_alloca_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            without_global_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_global_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_global_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_global_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            without_region_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_region_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_region_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_region_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            without_group_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_group_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_group_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_group_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            without_constant_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_constant_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_constant_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_constant_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            with_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_with_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_with_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_both_with_and_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_call_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_call_with_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_with_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_call_both_with_and_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            with_cast_call_without_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_cast_call_without_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            with_cast_call_with_private_to_flat_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_cast_call_with_private_to_flat_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            call_without_alloca_and_without_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_without_alloca_and_without_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+;
+; GFX10: name:            call_without_alloca_and_with_addrspacecast
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_without_alloca_and_with_addrspacecast_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+;
+; GFX10: name:            with_indirect_call
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            with_indirect_call_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr8_sgpr9' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_with_indirect_call
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_with_indirect_call_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr8_sgpr9' }
+; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            use_intrinsic_workitem_id_x
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            use_intrinsic_workitem_id_x_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
+;
+; GFX10: name:            call_use_intrinsic_workitem_id_x
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
+;
+; GFX10: name:            call_use_intrinsic_workitem_id_x_cc_kernel
+; GFX10:       argumentInfo:
+; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
new file mode 100644
index 00000000000000..c0d700cc37464b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -0,0 +1,914 @@
+; Test the generation of the attribute amdgpu-no-flat-scratch-init
+; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1010 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s
+
+;; tests of alloca
+
+define void @without_alloca(i1 %arg0) {
+; GFX9-LABEL: define void @without_alloca(i1 %arg0)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_alloca(i1 %arg0)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define void @with_alloca() {
+; GFX9-LABEL: define void @with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
+; GFX9-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i1 %arg0, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+; graphics functions won't get the attribute amdgpu-no-flat-scratch-init
+
+define amdgpu_vs void @with_alloca_cc_vs() {
+; GFX9-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_gs void @with_alloca_cc_gs() {
+; GFX9-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_ps void @with_alloca_cc_ps() {
+; GFX9-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs void @with_alloca_cc_cs() {
+; GFX9-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_hs void @with_alloca_cc_hs() {
+; GFX9-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_ls void @with_alloca_cc_ls() {
+; GFX9-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_es void @with_alloca_cc_es() {
+; GFX9-LABEL: define amdgpu_es void @with_alloca_cc_es()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_es void @with_alloca_cc_es()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_gfx void @with_alloca_cc_gfx() {
+; GFX9-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
+; GFX9-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
+; GFX9-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
+; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
+; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  ret void
+}
+
+define void @call_without_alloca() {
+; GFX9-LABEL: define void @call_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @without_alloca(i1 true)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @without_alloca(i1 true)
+  ret void
+}
+
+define void @call_with_alloca() {
+; GFX9-LABEL: define void @call_with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @with_alloca()
+  ret void
+}
+
+define void @call_both_with_and_without_alloca() {
+; GFX9-LABEL: define void @call_both_with_and_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_both_with_and_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @with_alloca()
+  call void @without_alloca(i1 true) ; callee is declared as void (i1)
+  ret void
+}
+
+define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @with_alloca()
+  call void @without_alloca(i1 true) ; callee is declared as void (i1)
+  ret void
+}
+
+define void @call_call_without_alloca() {
+; GFX9-LABEL: define void @call_call_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @call_without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @call_without_alloca()
+  ret void
+}
+
+define void @call_call_with_alloca() {
+; GFX9-LABEL: define void @call_call_with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @call_with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @call_with_alloca()
+  ret void
+}
+
+define void @with_alloca_call_without_alloca() {
+; GFX9-LABEL: define void @with_alloca_call_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca_call_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @without_alloca(i1 true) ; callee is declared as void (i1)
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @without_alloca(i1 true) ; callee is declared as void (i1)
+  ret void
+}
+
+define void @with_alloca_call_with_alloca() {
+; GFX9-LABEL: define void @with_alloca_call_with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca_call_with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @with_alloca()
+  ret void
+}
+
+define void @with_alloca_call_call_without_alloca() {
+; GFX9-LABEL: define void @with_alloca_call_call_without_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca_call_call_without_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_without_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_without_alloca()
+  ret void
+}
+
+define void @with_alloca_call_call_with_alloca() {
+; GFX9-LABEL: define void @with_alloca_call_call_with_alloca()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_alloca_call_call_with_alloca()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_with_alloca()
+  ret void
+}
+
+define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %temp = alloca i32, addrspace(5)
+  store volatile i32 0, ptr addrspace(5) %temp, align 4
+  call void @call_with_alloca()
+  ret void
+}
+
+;; tests of addrspacecast
+
+define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(1) %ptr
+  ret void
+}
+
+define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(1) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(1) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
+; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(2) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(2) %ptr
+  ret void
+}
+
+define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
+; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(2) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(2) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(3) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(3) %ptr
+  ret void
+}
+
+define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(3) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
+; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(4) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(4) %ptr
+  ret void
+}
+
+define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
+; GFX9-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(4) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  store volatile i32 0, ptr addrspace(5) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  store volatile i32 0, ptr addrspace(5) %ptr
+  ret void
+}
+
+define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  ret void
+}
+
+define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  %stof = addrspacecast ptr addrspace(5) %ptr to ptr
+  store volatile i32 0, ptr %stof
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+;; tests of mixed alloca and addrspacecast
+
+define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+  call void @without_alloca(i1 true)
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  call void @without_alloca(i1 true)
+  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+  call void @without_alloca(i1 true)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+  call void @without_alloca(i1 true)
+  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  ret void
+}
+
+;; tests of indirect call, intrinsics
+
+@gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
+
+define void @with_indirect_call() {
+; GFX9-LABEL: define void @with_indirect_call()
+; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL:[0-9]+]]
+;
+; GFX10-LABEL: define void @with_indirect_call()
+; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL:[0-9]+]] {
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
+  call void %fptr()
+  ret void
+}
+
+define amdgpu_kernel void @with_indirect_call_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL2:[0-9]+]]
+  %fptr = load ptr, ptr addrspace(4) @gv.fptr0
+  call void %fptr()
+  ret void
+}
+
+define void @call_with_indirect_call() {
+; GFX9-LABEL: define void @call_with_indirect_call()
+; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_with_indirect_call()
+; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL:[0-9]+]]
+  call void @with_indirect_call()
+  ret void
+}
+
+define amdgpu_kernel void @call_with_indirect_call_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL2:[0-9]+]]
+  call void @with_indirect_call()
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x()
+
+define void @use_intrinsic_workitem_id_x() {
+; GFX9-LABEL: define void @use_intrinsic_workitem_id_x()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+;
+; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+  %val = call i32 @llvm.amdgcn.workitem.id.x()
+  store volatile i32 %val, ptr addrspace(1) undef
+  ret void
+}
+
+define void @call_use_intrinsic_workitem_id_x() {
+; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+;
+; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+  call void @use_intrinsic_workitem_id_x()
+  ret void
+}
+
+define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
+; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+  call void @use_intrinsic_workitem_id_x()
+  ret void
+}
+
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+;GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_IND_CALL]]  = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index 90562e25a3e9c1..470c444ad8cd74 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -23,7 +23,7 @@
 ; COV5: .amdhsa_user_sgpr_queue_ptr 0
 ; NOOPT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; NOOPT: .amdhsa_user_sgpr_dispatch_id 1
-; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0
+; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 1
 ; NOOPT: .amdhsa_user_sgpr_private_segment_size 0
 ; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
 ; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 6af45035d394f8..810a227152d2a9 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -33,8 +33,8 @@ define void @indirect_use_vcc() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
-; CI: ; NumSgprs: 38
-; VI-NOBUG: ; NumSgprs: 40
+; CI: ; NumSgprs: 36
+; VI-NOBUG: ; NumSgprs: 36
 ; VI-BUG: ; NumSgprs: 96
 ; GCN: ; NumVgprs: 41
 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
@@ -121,8 +121,8 @@ define void @indirect_use_80_sgpr() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
-; CI: ; NumSgprs: 84
-; VI-NOBUG: ; NumSgprs: 86
+; CI: ; NumSgprs: 82
+; VI-NOBUG: ; NumSgprs: 82
 ; VI-BUG: ; NumSgprs: 96
 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
   call void @indirect_use_80_sgpr()
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 032ec65fa85133..1511988ad61c19 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -192,11 +192,11 @@ define hidden void @use_workgroup_id_yz() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x:
 ; GCN-NOT: s6
-; GCN: s_mov_b32 s12, s6
+; GCN: s_mov_b32 s12, s4
 ; GCN: s_mov_b32 s32, 0
-; GCN: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x@rel32@hi+12
+; GCN: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+12
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 
@@ -210,7 +210,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y:
 ; GCN-NOT: s12
-; GCN: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s13, s5
 ; GCN-NOT: s12
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -226,7 +226,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z:
 ; GCN-NOT: s12
 ; GCN-NOT: s13
-; GCN: s_mov_b32 s14, s7
+; GCN: s_mov_b32 s14, s5
 ; GCN-NOT: s12
 ; GCN-NOT: s13
 
@@ -243,8 +243,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy:
 ; GCN-NOT: s14
-; GCN: s_mov_b32 s12, s6
-; GCN-NEXT: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s12, s4
+; GCN-NEXT: s_mov_b32 s13, s5
 ; GCN-NOT: s14
 
 ; GCN: s_mov_b32 s32, 0
@@ -259,9 +259,9 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
 }
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz:
-; GCN: s_mov_b32 s12, s6
-; GCN: s_mov_b32 s13, s7
-; GCN: s_mov_b32 s14, s8
+; GCN: s_mov_b32 s12, s4
+; GCN: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s14, s6
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 
@@ -276,8 +276,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 {
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz:
 
 ; GCN-NOT: s13
-; GCN: s_mov_b32 s12, s6
-; GCN-NEXT: s_mov_b32 s14, s7
+; GCN: s_mov_b32 s12, s4
+; GCN-NEXT: s_mov_b32 s14, s5
 ; GCN-NOT: s13
 
 ; GCN: s_mov_b32 s32, 0
@@ -293,8 +293,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz:
 
-; GCN: s_mov_b32 s13, s7
-; GCN: s_mov_b32 s14, s8
+; GCN: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s14, s6
 
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -375,7 +375,7 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
 
 ; GCN-NOT: s13
 ; GCN-NOT: s14
-; GCN-DAG: s_mov_b32 s12, s6
+; GCN-DAG: s_mov_b32 s12, s4
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
 ; GCN-NOT: s13
 ; GCN-NOT: s14
@@ -393,7 +393,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y:
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s13, s7
+; GCN-DAG: s_mov_b32 s13, s5
 
 ; GCN-DAG: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -408,7 +408,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z:
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s14, s7
+; GCN-DAG: s_mov_b32 s14, s5
 
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index aa182b720c6042..ebca9906998787 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -35,6 +35,6 @@ define amdgpu_kernel void @test_direct_indirect_call() {
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index 848019c8729251..074dba1cbcc931 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -42,6 +42,6 @@ attributes #0 = { "amdgpu-no-dispatch-id" }
 ;.
 ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
 ;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
index 0ad53083d0ff3f..eb3d356ea59b44 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -20,7 +20,7 @@ define amdgpu_kernel void @store_flat_i32(ptr addrspace(1) %gptr, i32 %x) #0 {
 }
 
 ; GCN-LABEL: {{^}}store_flat_i64:
-; GCN: flat_store_{{dwordx2|b64}}
+; GCN: flat_store_{{dword|b64}}
 define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 {
   %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   store volatile i64 %x, ptr %fptr, align 8
@@ -28,7 +28,7 @@ define amdgpu_kernel void @store_flat_i64(ptr addrspace(1) %gptr, i64 %x) #0 {
 }
 
 ; GCN-LABEL: {{^}}store_flat_v4i32:
-; GCN: flat_store_{{dwordx4|b128}}
+; GCN: flat_store_{{dword|b128}}
 define amdgpu_kernel void @store_flat_v4i32(ptr addrspace(1) %gptr, <4 x i32> %x) #0 {
   %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   store volatile <4 x i32> %x, ptr %fptr, align 16
@@ -65,7 +65,7 @@ define amdgpu_kernel void @load_flat_i32(ptr addrspace(1) noalias %out, ptr addr
 }
 
 ; GCN-LABEL: load_flat_i64:
-; GCN: flat_load_{{dwordx2|b64}}
+; GCN: flat_load_{{dword|b64}}
 define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
   %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   %fload = load volatile i64, ptr %fptr, align 8
@@ -74,7 +74,7 @@ define amdgpu_kernel void @load_flat_i64(ptr addrspace(1) noalias %out, ptr addr
 }
 
 ; GCN-LABEL: load_flat_v4i32:
-; GCN: flat_load_{{dwordx4|b128}}
+; GCN: flat_load_{{dword|b128}}
 define amdgpu_kernel void @load_flat_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %gptr) #0 {
   %fptr = addrspacecast ptr addrspace(1) %gptr to ptr
   %fload = load volatile <4 x i32>, ptr %fptr, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index b89dbd42e0466f..c998a4b19121e1 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -10,9 +10,11 @@
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT:    v_mov_b32_e32 v4, 1
+; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x40
+; GFX8V4-NEXT:    s_add_i32 s8, s8, s11
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s9
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V4-NEXT:    s_cselect_b32 s3, s3, 0
@@ -22,6 +24,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8V4-NEXT:    s_cselect_b32 s0, s2, 0
 ; GFX8V4-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX8V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v4
@@ -33,9 +36,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX8V5-LABEL: addrspacecast:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
-; GFX8V5-NEXT:    v_mov_b32_e32 v4, 1
+; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xc8
+; GFX8V5-NEXT:    s_add_i32 s6, s6, s9
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s7
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V5-NEXT:    s_cselect_b32 s2, s2, 0
@@ -45,6 +50,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8V5-NEXT:    s_cselect_b32 s0, s3, 0
 ; GFX8V5-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX8V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v4
@@ -56,10 +62,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX9V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX9V4-NEXT:    s_cselect_b32 s2, s3, 0
@@ -69,6 +76,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9V4-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX9V4-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX9V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v4
@@ -80,10 +88,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
+; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
-; GFX9V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX9V5-NEXT:    s_cselect_b32 s2, s3, 0
@@ -93,6 +102,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9V5-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX9V5-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX9V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v4
@@ -101,6 +111,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    flat_store_dword v[2:3], v0
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9V5-NEXT:    s_endpgm
+
   %flat.private = addrspacecast ptr addrspace(5) %ptr.private to ptr
   %flat.local = addrspacecast ptr addrspace(3) %ptr.local to ptr
   store volatile i32 1, ptr %flat.private
diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
index 4c21f872974559..5979f1bbacdd14 100644
--- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
@@ -258,25 +258,25 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo
 
 ;.
 ; V4: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
 ;.
 ; V5: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
 ;.
 ; V6: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
 ;.
 ; V4: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index b49931379b84a5..ae2beab4422ea4 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -30,7 +30,7 @@ define hidden void @func() #1 {
 ; GCN-NOT: writelane
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
 
-; GCN: ; NumSgprs: 37
+; GCN: ; NumSgprs: 33
 ; GCN: ; NumVgprs: 9
 define amdgpu_kernel void @kernel_call() #0 {
   %vgpr = load volatile i32, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index d51ace630f6925..aebc8315514fbc 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -113,24 +113,16 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
-; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
+; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -176,24 +168,16 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
-; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
+; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -239,24 +223,16 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
-; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
+; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -302,24 +278,16 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx)
 define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[16:17]
-; CHECK-NEXT:    s_add_u32 s16, s16, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12
-; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
-; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
+; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -352,12 +320,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -371,8 +334,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 0
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT:    ds_write_b16 v4, v3
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -385,24 +348,15 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    s_mov_b32 s15, 4
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -429,12 +383,7 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -448,8 +397,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 2
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT:    ds_write_b16 v4, v3
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -462,24 +411,15 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    s_mov_b32 s15, 6
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -506,12 +446,7 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -525,8 +460,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 1
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT:    ds_write_b16 v4, v3
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -539,24 +474,15 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    s_mov_b32 s15, 5
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -583,12 +509,7 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -602,8 +523,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 3
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
-; CHECK-NEXT:    ds_write_b16 v4, v3
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    ds_write_b16 v0, v1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -616,24 +537,15 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s10, s10, s15
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_addc_u32 s11, s11, 0
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s7
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; CHECK-NEXT:    s_add_u32 s8, s6, 8
-; CHECK-NEXT:    s_addc_u32 s9, s7, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
-; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; CHECK-NEXT:    s_mov_b32 s15, 7
-; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    s_add_u32 s8, s4, 8
+; CHECK-NEXT:    s_addc_u32 s9, s5, 0
+; CHECK-NEXT:    s_getpc_b64 s[4:5]
+; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index c201f84cac7268..8fadfe3d026669 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -42,28 +42,19 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 ; GCN-LABEL: indirect_lds_id:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
-; GCN-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s8, s6, 8
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; GCN-NEXT:    s_addc_u32 s9, s7, 0
-; GCN-NEXT:    s_getpc_b64 s[6:7]
-; GCN-NEXT:    s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s7, s7, function_lds_id@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
-; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
-; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
-; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_getpc_b64 s[8:9]
+; GCN-NEXT:    s_add_u32 s8, s8, function_lds_id@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s9, s9, function_lds_id@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
 ; GCN-NEXT:    s_mov_b32 s15, 21
+; GCN-NEXT:    s_mov_b32 s12, s6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s16
-; GCN-NEXT:    v_mov_b32_e32 v1, s17
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    s_endpgm
   call void @function_lds_id(ptr addrspace(1) %out)
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index 9bbcc6988e311f..05ad567478675c 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -161,10 +161,7 @@ define amdgpu_kernel void @k01() {
 ; GCN-LABEL: k01:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -200,10 +197,7 @@ define amdgpu_kernel void @k23() {
 ; GCN-LABEL: k23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -250,10 +244,7 @@ define amdgpu_kernel void @k123() {
 ; GCN-LABEL: k123:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 72a0aceaae12b6..3453ff9d296c09 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -226,10 +226,7 @@ define amdgpu_kernel void @k01() {
 ; GCN-LABEL: k01:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -273,10 +270,7 @@ define amdgpu_kernel void @k23() {
 ; GCN-LABEL: k23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -323,10 +317,7 @@ define amdgpu_kernel void @k123() {
 ; GCN-LABEL: k123:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s7
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 45e8b3bcff13c5..b201e6ddd7a5b7 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_agent_unordered_load(
 ; GFX7-LABEL: flat_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -200,6 +215,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_load(
 ; GFX7-LABEL: flat_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -215,6 +233,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -231,6 +253,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -262,6 +288,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -276,6 +304,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -385,6 +415,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_load(
 ; GFX7-LABEL: flat_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -401,6 +434,10 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -419,6 +456,10 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -452,6 +493,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -467,6 +510,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -589,6 +634,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_load(
 ; GFX7-LABEL: flat_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -606,6 +654,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -626,6 +678,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -662,6 +718,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -678,6 +736,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -815,6 +875,9 @@ entry:
 define amdgpu_kernel void @flat_agent_unordered_store(
 ; GFX7-LABEL: flat_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -826,6 +889,10 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -838,6 +905,10 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -861,6 +932,8 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -872,6 +945,8 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -955,6 +1030,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_store(
 ; GFX7-LABEL: flat_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -966,6 +1044,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -978,6 +1060,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1001,6 +1087,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1012,6 +1100,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1095,6 +1185,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_store(
 ; GFX7-LABEL: flat_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1107,6 +1200,10 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1121,6 +1218,10 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1147,6 +1248,8 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1159,6 +1262,8 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1261,6 +1366,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_store(
 ; GFX7-LABEL: flat_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1273,6 +1381,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1287,6 +1399,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1313,6 +1429,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1325,6 +1443,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1427,6 +1547,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1439,6 +1562,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1452,6 +1579,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1477,6 +1608,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1489,6 +1622,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1579,6 +1714,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ; GFX7-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1593,6 +1731,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1610,6 +1752,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1640,6 +1786,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1654,6 +1802,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1762,6 +1912,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ; GFX7-LABEL: flat_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1775,6 +1928,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1790,6 +1947,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1818,6 +1979,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1831,6 +1994,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1940,6 +2105,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1955,6 +2123,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1974,6 +2146,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2007,6 +2183,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2022,6 +2200,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2149,6 +2329,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2164,6 +2347,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2183,6 +2370,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2216,6 +2407,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2231,6 +2424,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2358,6 +2553,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2374,6 +2572,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2392,6 +2594,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2425,6 +2631,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2440,6 +2648,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2562,6 +2772,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2579,6 +2792,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2599,6 +2816,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2635,6 +2856,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2651,6 +2874,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2792,6 +3017,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2809,6 +3037,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2829,6 +3061,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2865,6 +3101,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2881,6 +3119,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3022,6 +3262,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3047,6 +3290,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3072,6 +3319,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3122,6 +3373,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3137,6 +3390,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3253,6 +3508,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3280,6 +3538,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3309,6 +3571,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3364,6 +3630,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3381,6 +3649,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3515,6 +3785,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3541,6 +3814,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3568,6 +3845,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3621,6 +3902,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3637,6 +3920,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3772,6 +4057,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3800,6 +4088,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3831,6 +4123,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3889,6 +4185,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3907,6 +4205,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4060,6 +4360,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4088,6 +4391,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4119,6 +4426,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4177,6 +4488,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4195,6 +4508,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4348,6 +4663,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4375,6 +4693,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4404,6 +4726,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4459,6 +4785,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4476,6 +4804,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4610,6 +4940,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4637,6 +4970,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4666,6 +5003,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4721,6 +5062,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4738,6 +5081,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4872,6 +5217,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4900,6 +5248,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4931,6 +5283,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4989,6 +5345,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5007,6 +5365,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5160,6 +5520,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5188,6 +5551,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5219,6 +5586,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5277,6 +5648,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5295,6 +5668,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5448,6 +5823,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5476,6 +5854,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5507,6 +5889,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5565,6 +5951,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5583,6 +5971,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5736,6 +6126,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5764,6 +6157,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5795,6 +6192,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5853,6 +6254,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5871,6 +6274,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6024,6 +6429,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6052,6 +6460,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6083,6 +6495,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6141,6 +6557,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6159,6 +6577,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6312,6 +6732,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6340,6 +6763,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6371,6 +6798,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6429,6 +6860,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6447,6 +6880,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6600,6 +7035,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6628,6 +7066,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6659,6 +7101,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6717,6 +7163,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6735,6 +7183,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6888,6 +7338,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6916,6 +7369,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6947,6 +7404,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7005,6 +7466,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7023,6 +7486,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7176,6 +7641,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7204,6 +7672,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7232,6 +7704,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7288,6 +7764,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7306,6 +7784,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7449,6 +7929,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7478,6 +7961,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7508,6 +7995,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7566,6 +8057,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7585,6 +8078,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7741,6 +8236,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7770,6 +8268,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7800,6 +8302,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7859,6 +8365,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7878,6 +8386,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8040,6 +8550,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8070,6 +8583,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8102,6 +8619,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8163,6 +8684,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8183,6 +8706,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8358,6 +8883,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8388,6 +8916,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8420,6 +8952,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8481,6 +9017,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8501,6 +9039,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8676,6 +9216,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8705,6 +9248,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8735,6 +9282,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8793,6 +9344,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8812,6 +9365,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8968,6 +9523,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8997,6 +9555,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9027,6 +9589,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9085,6 +9651,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9104,6 +9672,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9260,6 +9830,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9290,6 +9863,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9322,6 +9899,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9383,6 +9964,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9403,6 +9986,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9578,6 +10163,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9608,6 +10196,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9640,6 +10232,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9701,6 +10297,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9721,6 +10319,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9896,6 +10496,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9926,6 +10529,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9958,6 +10565,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10019,6 +10630,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10039,6 +10652,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10214,6 +10829,9 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10244,6 +10862,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10276,6 +10898,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10337,6 +10963,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10357,6 +10985,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10532,6 +11162,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10562,6 +11195,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10594,6 +11231,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10655,6 +11296,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10675,6 +11318,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10850,6 +11495,9 @@ entry:
 define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10880,6 +11528,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10912,6 +11564,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10973,6 +11629,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10993,6 +11651,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11168,6 +11828,9 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11198,6 +11861,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11230,6 +11897,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11291,6 +11962,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11311,6 +11984,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11486,6 +12161,9 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11516,6 +12194,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11548,6 +12230,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11609,6 +12295,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11629,6 +12317,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11804,6 +12494,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ; GFX7-LABEL: flat_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11819,6 +12512,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11835,6 +12532,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11866,6 +12567,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11880,6 +12583,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11989,6 +12694,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12004,6 +12712,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12020,6 +12732,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12051,6 +12767,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12065,6 +12783,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12174,6 +12894,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ; GFX7-LABEL: flat_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12191,6 +12914,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12210,6 +12937,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12245,6 +12976,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12261,6 +12994,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12388,6 +13123,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12406,6 +13144,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12427,6 +13169,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12465,6 +13211,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12482,6 +13230,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12624,6 +13374,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ; GFX7-LABEL: flat_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12635,6 +13388,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12647,6 +13404,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12670,6 +13431,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12681,6 +13444,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12764,6 +13529,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12775,6 +13543,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12787,6 +13559,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12810,6 +13586,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12821,6 +13599,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12904,6 +13684,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_store(
 ; GFX7-LABEL: flat_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12916,6 +13699,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12930,6 +13717,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12956,6 +13747,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12968,6 +13761,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13070,6 +13865,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13082,6 +13880,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13096,6 +13898,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13122,6 +13928,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13134,6 +13942,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13236,6 +14046,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13248,6 +14061,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13261,6 +14078,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13286,6 +14107,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13298,6 +14121,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13388,6 +14213,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13402,6 +14230,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13418,6 +14250,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13447,6 +14283,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13461,6 +14299,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13567,6 +14407,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13580,6 +14423,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13595,6 +14442,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13623,6 +14474,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13636,6 +14489,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13745,6 +14600,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13760,6 +14618,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13778,6 +14640,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13810,6 +14676,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13825,6 +14693,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13950,6 +14820,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13965,6 +14838,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13983,6 +14860,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14015,6 +14896,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14030,6 +14913,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14155,6 +15040,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14172,6 +15060,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14191,6 +15083,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14226,6 +15122,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14242,6 +15140,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14369,6 +15269,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14387,6 +15290,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14408,6 +15315,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14446,6 +15357,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14463,6 +15376,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14609,6 +15524,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14627,6 +15545,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14648,6 +15570,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14686,6 +15612,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14703,6 +15631,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14849,6 +15779,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14874,6 +15807,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14899,6 +15836,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14949,6 +15890,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14964,6 +15907,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15080,6 +16025,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15107,6 +16055,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15135,6 +16087,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15189,6 +16145,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15206,6 +16164,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15338,6 +16298,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15364,6 +16327,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15391,6 +16358,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15444,6 +16415,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15460,6 +16433,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15595,6 +16570,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15623,6 +16601,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15653,6 +16635,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15710,6 +16696,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15728,6 +16716,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15879,6 +16869,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15907,6 +16900,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15937,6 +16934,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15994,6 +16995,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16012,6 +17015,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16163,6 +17168,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16190,6 +17198,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16218,6 +17230,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16272,6 +17288,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16289,6 +17307,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16421,6 +17441,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16448,6 +17471,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16476,6 +17503,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16530,6 +17561,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16547,6 +17580,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16679,6 +17714,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16707,6 +17745,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16737,6 +17779,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16794,6 +17840,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16812,6 +17860,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16963,6 +18013,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16991,6 +18044,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17021,6 +18078,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17078,6 +18139,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17096,6 +18159,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17247,6 +18312,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17275,6 +18343,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17305,6 +18377,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17362,6 +18438,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17380,6 +18458,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17531,6 +18611,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17559,6 +18642,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17589,6 +18676,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17646,6 +18737,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17664,6 +18757,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17815,6 +18910,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17843,6 +18941,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17873,6 +18975,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17930,6 +19036,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17948,6 +19056,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18099,6 +19209,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18127,6 +19240,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18157,6 +19274,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18214,6 +19335,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18232,6 +19355,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18383,6 +19508,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18411,6 +19539,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18441,6 +19573,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18498,6 +19634,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18516,6 +19654,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18667,6 +19807,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18695,6 +19838,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18725,6 +19872,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18782,6 +19933,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18800,6 +19953,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18951,6 +20106,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18979,6 +20137,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19007,6 +20169,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19063,6 +20229,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19081,6 +20249,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19224,6 +20394,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19254,6 +20427,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19285,6 +20462,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19345,6 +20526,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19365,6 +20548,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19526,6 +20711,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19555,6 +20743,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19585,6 +20777,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19644,6 +20840,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19663,6 +20861,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19825,6 +21025,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19856,6 +21059,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19889,6 +21096,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19952,6 +21163,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19973,6 +21186,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20153,6 +21368,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20184,6 +21402,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20217,6 +21439,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20280,6 +21506,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20301,6 +21529,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20481,6 +21711,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20511,6 +21744,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20542,6 +21779,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20602,6 +21843,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20622,6 +21865,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20783,6 +22028,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20813,6 +22061,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20844,6 +22096,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20904,6 +22160,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20924,6 +22182,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21085,6 +22345,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21116,6 +22379,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21149,6 +22416,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21212,6 +22483,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21233,6 +22506,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21413,6 +22688,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21444,6 +22722,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21477,6 +22759,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21540,6 +22826,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21561,6 +22849,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21741,6 +23031,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21772,6 +23065,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21805,6 +23102,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21868,6 +23169,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21889,6 +23192,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22069,6 +23374,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22100,6 +23408,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22133,6 +23445,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22196,6 +23512,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22217,6 +23535,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22397,6 +23717,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22428,6 +23751,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22461,6 +23788,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22524,6 +23855,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22545,6 +23878,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22725,6 +24060,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22756,6 +24094,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22789,6 +24131,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22852,6 +24198,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22873,6 +24221,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23053,6 +24403,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23084,6 +24437,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23117,6 +24474,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23180,6 +24541,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23201,6 +24564,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23381,6 +24746,9 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23412,6 +24780,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23445,6 +24817,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23508,6 +24884,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23529,6 +24907,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 5fa8e6891bafb1..5a89e1269526bd 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -200,6 +215,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX7-LABEL: flat_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -228,6 +246,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
@@ -255,6 +277,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_mov_b32 s6, 2
@@ -310,6 +336,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
@@ -339,6 +367,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
@@ -539,6 +569,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_0(
 ; GFX7-LABEL: flat_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -554,6 +587,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -570,6 +607,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -601,6 +642,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -615,6 +658,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -724,6 +769,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX7-LABEL: flat_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -752,6 +800,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -780,6 +832,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -836,6 +892,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -866,6 +924,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1067,6 +1127,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ; GFX7-LABEL: flat_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -1083,6 +1146,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1100,6 +1167,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1133,6 +1204,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1148,6 +1221,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 4c9ce15211e34c..f39801b6b7ade9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_singlethread_unordered_load(
 ; GFX7-LABEL: flat_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -200,6 +215,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ; GFX7-LABEL: flat_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -215,6 +233,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -231,6 +253,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -262,6 +288,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -276,6 +304,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -385,6 +415,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_load(
 ; GFX7-LABEL: flat_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -400,6 +433,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -416,6 +453,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -447,6 +488,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -461,6 +504,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -570,6 +615,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ; GFX7-LABEL: flat_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -585,6 +633,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -601,6 +653,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -632,6 +688,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -646,6 +704,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -755,6 +815,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_unordered_store(
 ; GFX7-LABEL: flat_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -766,6 +829,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -778,6 +845,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -801,6 +872,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -812,6 +885,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -895,6 +970,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ; GFX7-LABEL: flat_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -906,6 +984,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -918,6 +1000,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -941,6 +1027,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -952,6 +1040,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1035,6 +1125,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_store(
 ; GFX7-LABEL: flat_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1046,6 +1139,10 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1058,6 +1155,10 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1081,6 +1182,8 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1092,6 +1195,8 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1175,6 +1280,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ; GFX7-LABEL: flat_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1186,6 +1294,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1198,6 +1310,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1221,6 +1337,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1232,6 +1350,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1315,6 +1435,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1327,6 +1450,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1340,6 +1467,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1365,6 +1496,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1377,6 +1510,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1467,6 +1602,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1479,6 +1617,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1492,6 +1634,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1517,6 +1663,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1529,6 +1677,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1619,6 +1769,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1631,6 +1784,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1644,6 +1801,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1669,6 +1830,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1681,6 +1844,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1771,6 +1936,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1783,6 +1951,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1796,6 +1968,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1821,6 +1997,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1833,6 +2011,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1923,6 +2103,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1935,6 +2118,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1948,6 +2135,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1973,6 +2164,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1985,6 +2178,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2075,6 +2270,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2090,6 +2288,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2106,6 +2308,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2137,6 +2343,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2151,6 +2359,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2260,6 +2470,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2275,6 +2488,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2291,6 +2508,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2322,6 +2543,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2336,6 +2559,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2445,6 +2670,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2460,6 +2688,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2476,6 +2708,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2507,6 +2743,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2521,6 +2759,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2630,6 +2870,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2655,6 +2898,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2680,6 +2927,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2730,6 +2981,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2745,6 +2998,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2861,6 +3116,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2886,6 +3144,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2911,6 +3173,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2961,6 +3227,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2976,6 +3244,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3092,6 +3362,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3117,6 +3390,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3142,6 +3419,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3192,6 +3473,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3207,6 +3490,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3323,6 +3608,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3348,6 +3636,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3373,6 +3665,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3423,6 +3719,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3438,6 +3736,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3554,6 +3854,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3579,6 +3882,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3604,6 +3911,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3654,6 +3965,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3669,6 +3982,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3785,6 +4100,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3810,6 +4128,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3835,6 +4157,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3885,6 +4211,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3900,6 +4228,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4016,6 +4346,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4041,6 +4374,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4066,6 +4403,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4116,6 +4457,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4131,6 +4474,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4247,6 +4592,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4272,6 +4620,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4297,6 +4649,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4347,6 +4703,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4362,6 +4720,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4478,6 +4838,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4503,6 +4866,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4528,6 +4895,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4578,6 +4949,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4593,6 +4966,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4709,6 +5084,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4734,6 +5112,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4759,6 +5141,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4809,6 +5195,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4824,6 +5212,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4940,6 +5330,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4965,6 +5358,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4990,6 +5387,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5040,6 +5441,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5055,6 +5458,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5171,6 +5576,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5196,6 +5604,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5221,6 +5633,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5271,6 +5687,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5286,6 +5704,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5402,6 +5822,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5427,6 +5850,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5452,6 +5879,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5502,6 +5933,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5517,6 +5950,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5633,6 +6068,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5658,6 +6096,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5683,6 +6125,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5733,6 +6179,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5748,6 +6196,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5864,6 +6314,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5889,6 +6342,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5914,6 +6371,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5964,6 +6425,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5979,6 +6442,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6095,6 +6560,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6123,6 +6591,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6151,6 +6623,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6207,6 +6683,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6225,6 +6703,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6368,6 +6848,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6396,6 +6879,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6424,6 +6911,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6480,6 +6971,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6498,6 +6991,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6641,6 +7136,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6669,6 +7167,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6697,6 +7199,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6753,6 +7259,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6771,6 +7279,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6914,6 +7424,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6942,6 +7455,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6970,6 +7487,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7026,6 +7547,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7044,6 +7567,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7187,6 +7712,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7215,6 +7743,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7243,6 +7775,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7299,6 +7835,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7317,6 +7855,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7460,6 +8000,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7488,6 +8031,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7516,6 +8063,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7572,6 +8123,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7590,6 +8143,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7733,6 +8288,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7761,6 +8319,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7789,6 +8351,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7845,6 +8411,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7863,6 +8431,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8006,6 +8576,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8034,6 +8607,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8062,6 +8639,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8118,6 +8699,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8136,6 +8719,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8279,6 +8864,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8307,6 +8895,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8335,6 +8927,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8391,6 +8987,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8409,6 +9007,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8552,6 +9152,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8580,6 +9183,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8608,6 +9215,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8664,6 +9275,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8682,6 +9295,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8825,6 +9440,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8853,6 +9471,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8881,6 +9503,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8937,6 +9563,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8955,6 +9583,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9098,6 +9728,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9126,6 +9759,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9154,6 +9791,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9210,6 +9851,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9228,6 +9871,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9371,6 +10016,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9399,6 +10047,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9427,6 +10079,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9483,6 +10139,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9501,6 +10159,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9644,6 +10304,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9672,6 +10335,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9700,6 +10367,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9756,6 +10427,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9774,6 +10447,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9917,6 +10592,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9945,6 +10623,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9973,6 +10655,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10029,6 +10715,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10047,6 +10735,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10190,6 +10880,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10205,6 +10898,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10221,6 +10918,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10252,6 +10953,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10266,6 +10969,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10375,6 +11080,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10390,6 +11098,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10406,6 +11118,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10437,6 +11153,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10451,6 +11169,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10560,6 +11280,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10575,6 +11298,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10591,6 +11318,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10622,6 +11353,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10636,6 +11369,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10745,6 +11480,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10760,6 +11498,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10776,6 +11518,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10807,6 +11553,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10821,6 +11569,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10930,6 +11680,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10941,6 +11694,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10953,6 +11710,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10976,6 +11737,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10987,6 +11750,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11070,6 +11835,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11081,6 +11849,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11093,6 +11865,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11116,6 +11892,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11127,6 +11905,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11210,6 +11990,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ; GFX7-LABEL: flat_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11221,6 +12004,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11233,6 +12020,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11256,6 +12047,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11267,6 +12060,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11350,6 +12145,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11361,6 +12159,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11373,6 +12175,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11396,6 +12202,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11407,6 +12215,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11490,6 +12300,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11502,6 +12315,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11515,6 +12332,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11540,6 +12361,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11552,6 +12375,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11642,6 +12467,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11654,6 +12482,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11667,6 +12499,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11692,6 +12528,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11704,6 +12542,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11794,6 +12634,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11806,6 +12649,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11819,6 +12666,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11844,6 +12695,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11856,6 +12709,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11946,6 +12801,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11958,6 +12816,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11971,6 +12833,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11996,6 +12862,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12008,6 +12876,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12098,6 +12968,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12110,6 +12983,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12123,6 +13000,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12148,6 +13029,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12160,6 +13043,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12250,6 +13135,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12265,6 +13153,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12281,6 +13173,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12312,6 +13208,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12326,6 +13224,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12435,6 +13335,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12450,6 +13353,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12466,6 +13373,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12497,6 +13408,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12511,6 +13424,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12620,6 +13535,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12635,6 +13553,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12651,6 +13573,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12682,6 +13608,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12696,6 +13624,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12805,6 +13735,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -12830,6 +13763,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -12855,6 +13792,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -12905,6 +13846,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12920,6 +13863,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13036,6 +13981,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13061,6 +14009,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13086,6 +14038,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13136,6 +14092,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13151,6 +14109,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13267,6 +14227,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13292,6 +14255,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13317,6 +14284,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13367,6 +14338,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13382,6 +14355,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13498,6 +14473,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13523,6 +14501,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13548,6 +14530,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13598,6 +14584,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13613,6 +14601,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13729,6 +14719,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13754,6 +14747,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13779,6 +14776,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13829,6 +14830,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13844,6 +14847,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13960,6 +14965,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13985,6 +14993,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14010,6 +15022,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14060,6 +15076,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14075,6 +15093,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14191,6 +15211,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14216,6 +15239,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14241,6 +15268,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14291,6 +15322,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14306,6 +15339,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14422,6 +15457,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14447,6 +15485,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14472,6 +15514,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14522,6 +15568,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14537,6 +15585,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14653,6 +15703,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14678,6 +15731,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14703,6 +15760,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14753,6 +15814,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14768,6 +15831,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14884,6 +15949,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14909,6 +15977,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14934,6 +16006,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14984,6 +16060,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14999,6 +16077,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15115,6 +16195,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15140,6 +16223,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15165,6 +16252,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15215,6 +16306,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15230,6 +16323,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15346,6 +16441,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15371,6 +16469,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15396,6 +16498,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15446,6 +16552,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15461,6 +16569,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15577,6 +16687,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15602,6 +16715,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15627,6 +16744,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15677,6 +16798,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15692,6 +16815,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15808,6 +16933,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15833,6 +16961,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15858,6 +16990,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15908,6 +17044,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15923,6 +17061,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16039,6 +17179,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16064,6 +17207,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16089,6 +17236,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16139,6 +17290,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16154,6 +17307,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16270,6 +17425,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16298,6 +17456,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16326,6 +17488,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16382,6 +17548,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16400,6 +17568,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16543,6 +17713,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16571,6 +17744,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16599,6 +17776,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16655,6 +17836,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16673,6 +17856,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16816,6 +18001,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16844,6 +18032,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16872,6 +18064,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16928,6 +18124,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16946,6 +18144,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17089,6 +18289,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17117,6 +18320,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17145,6 +18352,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17201,6 +18412,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17219,6 +18432,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17362,6 +18577,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17390,6 +18608,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17418,6 +18640,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17474,6 +18700,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17492,6 +18720,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17635,6 +18865,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17663,6 +18896,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17691,6 +18928,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17747,6 +18988,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17765,6 +19008,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17908,6 +19153,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17936,6 +19184,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17964,6 +19216,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18020,6 +19276,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18038,6 +19296,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18181,6 +19441,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18209,6 +19472,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18237,6 +19504,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18293,6 +19564,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18311,6 +19584,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18454,6 +19729,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18482,6 +19760,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18510,6 +19792,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18566,6 +19852,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18584,6 +19872,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18727,6 +20017,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18755,6 +20048,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18783,6 +20080,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18839,6 +20140,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18857,6 +20160,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19000,6 +20305,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19028,6 +20336,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19056,6 +20368,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19112,6 +20428,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19130,6 +20448,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19273,6 +20593,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19301,6 +20624,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19329,6 +20656,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19385,6 +20716,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19403,6 +20736,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19546,6 +20881,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19574,6 +20912,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19602,6 +20944,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19658,6 +21004,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19676,6 +21024,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19819,6 +21169,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19847,6 +21200,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19875,6 +21232,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19931,6 +21292,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19949,6 +21312,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20092,6 +21457,9 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20120,6 +21488,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20148,6 +21520,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20204,6 +21580,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20222,6 +21600,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index e77f1432c1c9d0..ae359090ea5f35 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_system_unordered_load(
 ; GFX7-LABEL: flat_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_system_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -200,6 +215,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_load(
 ; GFX7-LABEL: flat_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -215,6 +233,10 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -231,6 +253,10 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -262,6 +288,8 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -276,6 +304,8 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -385,6 +415,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_load(
 ; GFX7-LABEL: flat_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -401,6 +434,10 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -419,6 +456,10 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -452,6 +493,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -468,6 +511,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -591,6 +636,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_load(
 ; GFX7-LABEL: flat_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -608,6 +656,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -628,6 +680,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -664,6 +720,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -681,6 +739,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -819,6 +879,9 @@ entry:
 define amdgpu_kernel void @flat_system_unordered_store(
 ; GFX7-LABEL: flat_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -830,6 +893,10 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -842,6 +909,10 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_system_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -865,6 +936,8 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -876,6 +949,8 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -959,6 +1034,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_store(
 ; GFX7-LABEL: flat_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -970,6 +1048,10 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -982,6 +1064,10 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1005,6 +1091,8 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1016,6 +1104,8 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1099,6 +1189,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_store(
 ; GFX7-LABEL: flat_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1111,6 +1204,10 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1125,6 +1222,10 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX10-CU-LABEL: flat_system_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1151,6 +1252,8 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1164,6 +1267,8 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1267,6 +1372,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_store(
 ; GFX7-LABEL: flat_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1279,6 +1387,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1293,6 +1405,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1319,6 +1435,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1332,6 +1450,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1435,6 +1555,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1447,6 +1570,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1460,6 +1587,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1485,6 +1616,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1497,6 +1630,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1587,6 +1722,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ; GFX7-LABEL: flat_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1601,6 +1739,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1618,6 +1760,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1648,6 +1794,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1663,6 +1811,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1772,6 +1922,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_atomicrmw(
 ; GFX7-LABEL: flat_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1785,6 +1938,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1800,6 +1957,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1828,6 +1989,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1842,6 +2005,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1952,6 +2117,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1967,6 +2135,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1986,6 +2158,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2019,6 +2195,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2036,6 +2214,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2165,6 +2345,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2180,6 +2363,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2199,6 +2386,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2232,6 +2423,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2249,6 +2442,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2378,6 +2573,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2394,6 +2592,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2412,6 +2614,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2445,6 +2651,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2461,6 +2669,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2584,6 +2794,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2601,6 +2814,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2621,6 +2838,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2657,6 +2878,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2675,6 +2898,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2818,6 +3043,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2835,6 +3063,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2855,6 +3087,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2891,6 +3127,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2909,6 +3147,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3052,6 +3292,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3077,6 +3320,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3102,6 +3349,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3152,6 +3403,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3167,6 +3420,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3283,6 +3538,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3310,6 +3568,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3339,6 +3601,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3394,6 +3660,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3412,6 +3680,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3547,6 +3817,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3573,6 +3846,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3600,6 +3877,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3653,6 +3934,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3670,6 +3953,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3806,6 +4091,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3834,6 +4122,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3865,6 +4157,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3923,6 +4219,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3943,6 +4241,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4098,6 +4398,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4126,6 +4429,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4157,6 +4464,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4215,6 +4526,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4235,6 +4548,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4390,6 +4705,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4417,6 +4735,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4446,6 +4768,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4501,6 +4827,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4519,6 +4847,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4654,6 +4984,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4681,6 +5014,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4710,6 +5047,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4765,6 +5106,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4783,6 +5126,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4918,6 +5263,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4946,6 +5294,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4977,6 +5329,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5035,6 +5391,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5055,6 +5413,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5210,6 +5570,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5238,6 +5601,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5269,6 +5636,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5327,6 +5698,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5347,6 +5720,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5502,6 +5877,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5530,6 +5908,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5561,6 +5943,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5619,6 +6005,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5639,6 +6027,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5794,6 +6184,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5822,6 +6215,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5853,6 +6250,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5911,6 +6312,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5931,6 +6334,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6086,6 +6491,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6114,6 +6522,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6145,6 +6557,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6203,6 +6619,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6223,6 +6641,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6378,6 +6798,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6406,6 +6829,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6437,6 +6864,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6495,6 +6926,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6515,6 +6948,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6670,6 +7105,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6698,6 +7136,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6729,6 +7171,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6787,6 +7233,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6807,6 +7255,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6962,6 +7412,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6990,6 +7443,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7021,6 +7478,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7079,6 +7540,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7099,6 +7562,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7254,6 +7719,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7282,6 +7750,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7310,6 +7782,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7366,6 +7842,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7384,6 +7862,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7527,6 +8007,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7556,6 +8039,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7586,6 +8073,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7644,6 +8135,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7664,6 +8157,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7821,6 +8316,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7850,6 +8348,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7880,6 +8382,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7939,6 +8445,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7959,6 +8467,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8122,6 +8632,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8152,6 +8665,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8184,6 +8701,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8245,6 +8766,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8267,6 +8790,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8444,6 +8969,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8474,6 +9002,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8506,6 +9038,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8567,6 +9103,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8589,6 +9127,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8766,6 +9306,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8795,6 +9338,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8825,6 +9372,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8883,6 +9434,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8903,6 +9456,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9060,6 +9615,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9089,6 +9647,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9119,6 +9681,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9177,6 +9743,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9197,6 +9765,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9354,6 +9924,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9384,6 +9957,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9416,6 +9993,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9477,6 +10058,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9499,6 +10082,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9676,6 +10261,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9706,6 +10294,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9738,6 +10330,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9799,6 +10395,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9821,6 +10419,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9998,6 +10598,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10028,6 +10631,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10060,6 +10667,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10121,6 +10732,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10143,6 +10756,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10320,6 +10935,9 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10350,6 +10968,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10382,6 +11004,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10443,6 +11069,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10465,6 +11093,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10642,6 +11272,9 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10672,6 +11305,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10704,6 +11341,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10765,6 +11406,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10787,6 +11430,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10964,6 +11609,9 @@ entry:
 define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10994,6 +11642,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11026,6 +11678,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11087,6 +11743,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11109,6 +11767,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11286,6 +11946,9 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11316,6 +11979,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11348,6 +12015,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11409,6 +12080,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11431,6 +12104,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11608,6 +12283,9 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11638,6 +12316,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11670,6 +12352,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11731,6 +12417,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11753,6 +12441,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11930,6 +12620,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ; GFX7-LABEL: flat_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11945,6 +12638,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11961,6 +12658,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11992,6 +12693,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12006,6 +12709,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12115,6 +12820,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ; GFX7-LABEL: flat_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12130,6 +12838,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12146,6 +12858,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12177,6 +12893,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12191,6 +12909,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12300,6 +13020,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ; GFX7-LABEL: flat_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12317,6 +13040,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12336,6 +13063,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12371,6 +13102,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12388,6 +13121,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12516,6 +13251,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12534,6 +13272,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12555,6 +13297,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12593,6 +13339,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12611,6 +13359,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12754,6 +13504,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ; GFX7-LABEL: flat_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12765,6 +13518,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12777,6 +13534,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12800,6 +13561,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12811,6 +13574,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12894,6 +13659,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ; GFX7-LABEL: flat_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12905,6 +13673,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12917,6 +13689,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12940,6 +13716,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12951,6 +13729,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13034,6 +13814,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_store(
 ; GFX7-LABEL: flat_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13046,6 +13829,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13060,6 +13847,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13086,6 +13877,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13099,6 +13892,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13202,6 +13997,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13214,6 +14012,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13228,6 +14030,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13254,6 +14060,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13267,6 +14075,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13370,6 +14180,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13382,6 +14195,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13395,6 +14212,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13420,6 +14241,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13432,6 +14255,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13522,6 +14347,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13536,6 +14364,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13552,6 +14384,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13581,6 +14417,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13596,6 +14434,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13703,6 +14543,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13716,6 +14559,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13731,6 +14578,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13759,6 +14610,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13773,6 +14626,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13883,6 +14738,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13898,6 +14756,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13916,6 +14778,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13948,6 +14814,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13965,6 +14833,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14092,6 +14962,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14107,6 +14980,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14125,6 +15002,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14157,6 +15038,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14174,6 +15057,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14301,6 +15186,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14318,6 +15206,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14337,6 +15229,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14372,6 +15268,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14389,6 +15287,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14517,6 +15417,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14535,6 +15438,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14556,6 +15463,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14594,6 +15505,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14613,6 +15526,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14761,6 +15676,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14779,6 +15697,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14800,6 +15722,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14838,6 +15764,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -14857,6 +15785,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15005,6 +15935,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15030,6 +15963,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15055,6 +15992,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15105,6 +16046,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15120,6 +16063,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15236,6 +16181,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15263,6 +16211,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15291,6 +16243,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15345,6 +16301,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15363,6 +16321,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15496,6 +16456,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15522,6 +16485,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15549,6 +16516,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15602,6 +16573,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15619,6 +16592,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15755,6 +16730,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15783,6 +16761,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15813,6 +16795,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15870,6 +16856,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15890,6 +16878,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16043,6 +17033,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16071,6 +17064,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16101,6 +17098,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16158,6 +17159,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16178,6 +17181,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16331,6 +17336,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16358,6 +17366,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16386,6 +17398,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16440,6 +17456,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16458,6 +17476,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16591,6 +17611,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16618,6 +17641,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16646,6 +17673,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16700,6 +17731,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16718,6 +17751,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16851,6 +17886,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16879,6 +17917,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16909,6 +17951,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16966,6 +18012,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16986,6 +18034,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17139,6 +18189,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17167,6 +18220,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17197,6 +18254,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17254,6 +18315,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17274,6 +18337,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17427,6 +18492,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17455,6 +18523,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17485,6 +18557,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17542,6 +18618,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17562,6 +18640,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17715,6 +18795,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17743,6 +18826,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17773,6 +18860,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17830,6 +18921,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17850,6 +18943,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18003,6 +19098,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18031,6 +19129,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18061,6 +19163,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18118,6 +19224,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18138,6 +19246,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18291,6 +19401,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18319,6 +19432,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18349,6 +19466,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18406,6 +19527,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18426,6 +19549,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18579,6 +19704,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18607,6 +19735,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18637,6 +19769,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18694,6 +19830,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18714,6 +19852,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18867,6 +20007,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18895,6 +20038,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18925,6 +20072,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18982,6 +20133,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19002,6 +20155,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19155,6 +20310,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19183,6 +20341,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19211,6 +20373,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19267,6 +20433,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19285,6 +20453,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19428,6 +20598,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19458,6 +20631,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19489,6 +20666,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19549,6 +20730,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19570,6 +20753,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19732,6 +20917,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19761,6 +20949,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19791,6 +20983,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19850,6 +21046,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19870,6 +21068,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20033,6 +21233,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20064,6 +21267,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20097,6 +21304,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20160,6 +21371,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20183,6 +21396,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20365,6 +21580,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20396,6 +21614,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20429,6 +21651,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20492,6 +21718,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20515,6 +21743,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20697,6 +21927,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20727,6 +21960,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20758,6 +21995,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20818,6 +22059,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20839,6 +22082,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21001,6 +22246,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21031,6 +22279,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21062,6 +22314,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21122,6 +22378,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21143,6 +22401,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21305,6 +22565,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21336,6 +22599,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21369,6 +22636,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21432,6 +22703,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21455,6 +22728,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21637,6 +22912,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21668,6 +22946,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21701,6 +22983,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21764,6 +23050,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21787,6 +23075,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21969,6 +23259,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22000,6 +23293,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22033,6 +23330,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22096,6 +23397,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22119,6 +23422,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22301,6 +23606,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22332,6 +23640,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22365,6 +23677,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22428,6 +23744,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22451,6 +23769,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22633,6 +23953,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22664,6 +23987,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22697,6 +24024,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22760,6 +24091,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22783,6 +24116,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22965,6 +24300,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22996,6 +24334,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23029,6 +24371,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23092,6 +24438,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23115,6 +24463,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23297,6 +24647,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23328,6 +24681,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23361,6 +24718,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23424,6 +24785,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23447,6 +24810,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23629,6 +24994,9 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23660,6 +25028,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23693,6 +25065,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23756,6 +25132,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23779,6 +25157,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index 6bf54ccabc9dad..b6c1ce72f86918 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -11,6 +11,9 @@
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -27,6 +30,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -44,6 +51,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -152,6 +163,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX7-LABEL: flat_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -181,6 +195,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
@@ -209,6 +227,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_mov_b32 s6, 2
@@ -399,6 +421,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_0(
 ; GFX7-LABEL: flat_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -415,6 +440,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -432,6 +461,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -544,6 +577,9 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX7-LABEL: flat_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -573,6 +609,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -602,6 +642,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -797,6 +841,9 @@ entry:
 define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -813,6 +860,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -830,6 +881,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -936,6 +991,9 @@ entry:
 define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ; GFX7-LABEL: flat_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -948,6 +1006,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -962,6 +1024,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ;
 ; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index c7826181cc8dde..3b931330acdd7d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_wavefront_unordered_load(
 ; GFX7-LABEL: flat_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -200,6 +215,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ; GFX7-LABEL: flat_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -215,6 +233,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -231,6 +253,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -262,6 +288,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -276,6 +304,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -385,6 +415,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_load(
 ; GFX7-LABEL: flat_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -400,6 +433,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -416,6 +453,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -447,6 +488,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -461,6 +504,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -570,6 +615,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ; GFX7-LABEL: flat_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -585,6 +633,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -601,6 +653,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -632,6 +688,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -646,6 +704,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -755,6 +815,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_unordered_store(
 ; GFX7-LABEL: flat_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -766,6 +829,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -778,6 +845,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -801,6 +872,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -812,6 +885,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -895,6 +970,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ; GFX7-LABEL: flat_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -906,6 +984,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -918,6 +1000,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -941,6 +1027,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -952,6 +1040,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1035,6 +1125,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_store(
 ; GFX7-LABEL: flat_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1046,6 +1139,10 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1058,6 +1155,10 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1081,6 +1182,8 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1092,6 +1195,8 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1175,6 +1280,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ; GFX7-LABEL: flat_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1186,6 +1294,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1198,6 +1310,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1221,6 +1337,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1232,6 +1350,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1315,6 +1435,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1327,6 +1450,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1340,6 +1467,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1365,6 +1496,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1377,6 +1510,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1467,6 +1602,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1479,6 +1617,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1492,6 +1634,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1517,6 +1663,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1529,6 +1677,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1619,6 +1769,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1631,6 +1784,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1644,6 +1801,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1669,6 +1830,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1681,6 +1844,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1771,6 +1936,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1783,6 +1951,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1796,6 +1968,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1821,6 +1997,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1833,6 +2011,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1923,6 +2103,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1935,6 +2118,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1948,6 +2135,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1973,6 +2164,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1985,6 +2178,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2075,6 +2270,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2090,6 +2288,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2106,6 +2308,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2137,6 +2343,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2151,6 +2359,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2260,6 +2470,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2275,6 +2488,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2291,6 +2508,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2322,6 +2543,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2336,6 +2559,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2445,6 +2670,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2460,6 +2688,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2476,6 +2708,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2507,6 +2743,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2521,6 +2759,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2630,6 +2870,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2655,6 +2898,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2680,6 +2927,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2730,6 +2981,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2745,6 +2998,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2861,6 +3116,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2886,6 +3144,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2911,6 +3173,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2961,6 +3227,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2976,6 +3244,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3092,6 +3362,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3117,6 +3390,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3142,6 +3419,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3192,6 +3473,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3207,6 +3490,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3323,6 +3608,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3348,6 +3636,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3373,6 +3665,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3423,6 +3719,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3438,6 +3736,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3554,6 +3854,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3579,6 +3882,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3604,6 +3911,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3654,6 +3965,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3669,6 +3982,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3785,6 +4100,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3810,6 +4128,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3835,6 +4157,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3885,6 +4211,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3900,6 +4228,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4016,6 +4346,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4041,6 +4374,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4066,6 +4403,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4116,6 +4457,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4131,6 +4474,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4247,6 +4592,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4272,6 +4620,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4297,6 +4649,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4347,6 +4703,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4362,6 +4720,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4478,6 +4838,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4503,6 +4866,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4528,6 +4895,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4578,6 +4949,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4593,6 +4966,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4709,6 +5084,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4734,6 +5112,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4759,6 +5141,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4809,6 +5195,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4824,6 +5212,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4940,6 +5330,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4965,6 +5358,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4990,6 +5387,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5040,6 +5441,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5055,6 +5458,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5171,6 +5576,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5196,6 +5604,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5221,6 +5633,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5271,6 +5687,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5286,6 +5704,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5402,6 +5822,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5427,6 +5850,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5452,6 +5879,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5502,6 +5933,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5517,6 +5950,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5633,6 +6068,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5658,6 +6096,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5683,6 +6125,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5733,6 +6179,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5748,6 +6196,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5864,6 +6314,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5889,6 +6342,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5914,6 +6371,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5964,6 +6425,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5979,6 +6442,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6095,6 +6560,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6123,6 +6591,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6151,6 +6623,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6207,6 +6683,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6225,6 +6703,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6368,6 +6848,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6396,6 +6879,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6424,6 +6911,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6480,6 +6971,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6498,6 +6991,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6641,6 +7136,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6669,6 +7167,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6697,6 +7199,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6753,6 +7259,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6771,6 +7279,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6914,6 +7424,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6942,6 +7455,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6970,6 +7487,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7026,6 +7547,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7044,6 +7567,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7187,6 +7712,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7215,6 +7743,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7243,6 +7775,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7299,6 +7835,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7317,6 +7855,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7460,6 +8000,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7488,6 +8031,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7516,6 +8063,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7572,6 +8123,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7590,6 +8143,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7733,6 +8288,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7761,6 +8319,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7789,6 +8351,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7845,6 +8411,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7863,6 +8431,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8006,6 +8576,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8034,6 +8607,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8062,6 +8639,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8118,6 +8699,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8136,6 +8719,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8279,6 +8864,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8307,6 +8895,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8335,6 +8927,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8391,6 +8987,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8409,6 +9007,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8552,6 +9152,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8580,6 +9183,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8608,6 +9215,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8664,6 +9275,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8682,6 +9295,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8825,6 +9440,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8853,6 +9471,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8881,6 +9503,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8937,6 +9563,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8955,6 +9583,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9098,6 +9728,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9126,6 +9759,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9154,6 +9791,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9210,6 +9851,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9228,6 +9871,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9371,6 +10016,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9399,6 +10047,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9427,6 +10079,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9483,6 +10139,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9501,6 +10159,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9644,6 +10304,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9672,6 +10335,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9700,6 +10367,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9756,6 +10427,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9774,6 +10447,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9917,6 +10592,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9945,6 +10623,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9973,6 +10655,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10029,6 +10715,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10047,6 +10735,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10190,6 +10880,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ; GFX7-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10205,6 +10898,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10221,6 +10918,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10252,6 +10953,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10266,6 +10969,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10375,6 +11080,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10390,6 +11098,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10406,6 +11118,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10437,6 +11153,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10451,6 +11169,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10560,6 +11280,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10575,6 +11298,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10591,6 +11318,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10622,6 +11353,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10636,6 +11369,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10745,6 +11480,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10760,6 +11498,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10776,6 +11518,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10807,6 +11553,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10821,6 +11569,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10930,6 +11680,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ; GFX7-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10941,6 +11694,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10953,6 +11710,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10976,6 +11737,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -10987,6 +11750,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11070,6 +11835,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11081,6 +11849,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11093,6 +11865,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11116,6 +11892,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11127,6 +11905,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11210,6 +11990,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ; GFX7-LABEL: flat_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11221,6 +12004,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11233,6 +12020,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11256,6 +12047,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11267,6 +12060,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11350,6 +12145,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11361,6 +12159,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11373,6 +12175,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11396,6 +12202,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11407,6 +12215,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11490,6 +12300,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11502,6 +12315,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11515,6 +12332,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11540,6 +12361,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11552,6 +12375,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11642,6 +12467,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11654,6 +12482,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11667,6 +12499,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11692,6 +12528,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11704,6 +12542,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11794,6 +12634,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11806,6 +12649,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11819,6 +12666,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11844,6 +12695,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11856,6 +12709,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11946,6 +12801,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11958,6 +12816,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11971,6 +12833,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11996,6 +12862,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12008,6 +12876,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12098,6 +12968,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12110,6 +12983,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12123,6 +13000,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12148,6 +13029,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12160,6 +13043,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12250,6 +13135,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12265,6 +13153,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12281,6 +13173,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12312,6 +13208,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12326,6 +13224,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12435,6 +13335,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12450,6 +13353,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12466,6 +13373,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12497,6 +13408,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12511,6 +13424,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12620,6 +13535,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12635,6 +13553,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12651,6 +13573,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12682,6 +13608,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12696,6 +13624,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12805,6 +13735,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -12830,6 +13763,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -12855,6 +13792,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -12905,6 +13846,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12920,6 +13863,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13036,6 +13981,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13061,6 +14009,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13086,6 +14038,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13136,6 +14092,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13151,6 +14109,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13267,6 +14227,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13292,6 +14255,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13317,6 +14284,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13367,6 +14338,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13382,6 +14355,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13498,6 +14473,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13523,6 +14501,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13548,6 +14530,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13598,6 +14584,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13613,6 +14601,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13729,6 +14719,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13754,6 +14747,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13779,6 +14776,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13829,6 +14830,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13844,6 +14847,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13960,6 +14965,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13985,6 +14993,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14010,6 +15022,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14060,6 +15076,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14075,6 +15093,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14191,6 +15211,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14216,6 +15239,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14241,6 +15268,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14291,6 +15322,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14306,6 +15339,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14422,6 +15457,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14447,6 +15485,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14472,6 +15514,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14522,6 +15568,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14537,6 +15585,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14653,6 +15703,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14678,6 +15731,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14703,6 +15760,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14753,6 +15814,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14768,6 +15831,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14884,6 +15949,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14909,6 +15977,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14934,6 +16006,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14984,6 +16060,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14999,6 +16077,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15115,6 +16195,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15140,6 +16223,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15165,6 +16252,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15215,6 +16306,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15230,6 +16323,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15346,6 +16441,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15371,6 +16469,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15396,6 +16498,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15446,6 +16552,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15461,6 +16569,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15577,6 +16687,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15602,6 +16715,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15627,6 +16744,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15677,6 +16798,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15692,6 +16815,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15808,6 +16933,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15833,6 +16961,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15858,6 +16990,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15908,6 +17044,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15923,6 +17061,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16039,6 +17179,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16064,6 +17207,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16089,6 +17236,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16139,6 +17290,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16154,6 +17307,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16270,6 +17425,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16298,6 +17456,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16326,6 +17488,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16382,6 +17548,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16400,6 +17568,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16543,6 +17713,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16571,6 +17744,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16599,6 +17776,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16655,6 +17836,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16673,6 +17856,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16816,6 +18001,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16844,6 +18032,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16872,6 +18064,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16928,6 +18124,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16946,6 +18144,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17089,6 +18289,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17117,6 +18320,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17145,6 +18352,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17201,6 +18412,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17219,6 +18432,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17362,6 +18577,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17390,6 +18608,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17418,6 +18640,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17474,6 +18700,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17492,6 +18720,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17635,6 +18865,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17663,6 +18896,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17691,6 +18928,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17747,6 +18988,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17765,6 +19008,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17908,6 +19153,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17936,6 +19184,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17964,6 +19216,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18020,6 +19276,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18038,6 +19296,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18181,6 +19441,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18209,6 +19472,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18237,6 +19504,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18293,6 +19564,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18311,6 +19584,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18454,6 +19729,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18482,6 +19760,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18510,6 +19792,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18566,6 +19852,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18584,6 +19872,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18727,6 +20017,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18755,6 +20048,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18783,6 +20080,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18839,6 +20140,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18857,6 +20160,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19000,6 +20305,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19028,6 +20336,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19056,6 +20368,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19112,6 +20428,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19130,6 +20448,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19273,6 +20593,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19301,6 +20624,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19329,6 +20656,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19385,6 +20716,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19403,6 +20736,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19546,6 +20881,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19574,6 +20912,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19602,6 +20944,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19658,6 +21004,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19676,6 +21024,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19819,6 +21169,9 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19847,6 +21200,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19875,6 +21232,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19931,6 +21292,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19949,6 +21312,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 8949e4b782f630..54baa032c3139c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -15,6 +15,9 @@
 define amdgpu_kernel void @flat_workgroup_unordered_load(
 ; GFX7-LABEL: flat_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,6 +33,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -46,6 +53,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -77,6 +88,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -91,6 +104,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -200,6 +215,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ; GFX7-LABEL: flat_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -215,6 +233,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -231,6 +253,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -262,6 +288,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -276,6 +304,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -385,6 +415,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_load(
 ; GFX7-LABEL: flat_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -401,6 +434,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -418,6 +455,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -451,6 +492,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -466,6 +509,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -584,6 +629,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ; GFX7-LABEL: flat_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -601,6 +649,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -620,6 +672,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -655,6 +711,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -671,6 +729,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -800,6 +860,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_unordered_store(
 ; GFX7-LABEL: flat_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -811,6 +874,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -823,6 +890,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -846,6 +917,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -857,6 +930,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -940,6 +1015,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ; GFX7-LABEL: flat_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -951,6 +1029,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -963,6 +1045,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -986,6 +1072,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -997,6 +1085,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1080,6 +1170,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_store(
 ; GFX7-LABEL: flat_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1092,6 +1185,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1106,6 +1203,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1131,6 +1232,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1143,6 +1246,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1238,6 +1343,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ; GFX7-LABEL: flat_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1250,6 +1358,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1264,6 +1376,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1289,6 +1405,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1301,6 +1419,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1396,6 +1516,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1408,6 +1531,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1421,6 +1548,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1446,6 +1577,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1458,6 +1591,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1548,6 +1683,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1561,6 +1699,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1577,6 +1719,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1604,6 +1750,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1617,6 +1765,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1719,6 +1869,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1732,6 +1885,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1747,6 +1904,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1774,6 +1935,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1787,6 +1950,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1889,6 +2054,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1903,6 +2071,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1921,6 +2093,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1950,6 +2126,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1964,6 +2142,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2078,6 +2258,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2092,6 +2275,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2110,6 +2297,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2139,6 +2330,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2153,6 +2346,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2267,6 +2462,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2283,6 +2481,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2300,6 +2502,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2333,6 +2539,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2348,6 +2556,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2466,6 +2676,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2483,6 +2696,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2502,6 +2719,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2537,6 +2758,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2553,6 +2776,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2683,6 +2908,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2700,6 +2928,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2719,6 +2951,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2754,6 +2990,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2770,6 +3008,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2900,6 +3140,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2925,6 +3168,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2950,6 +3197,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3000,6 +3251,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3015,6 +3268,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3131,6 +3386,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3157,6 +3415,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3185,6 +3447,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3237,6 +3503,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3253,6 +3521,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3381,6 +3651,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3407,6 +3680,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3434,6 +3711,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3486,6 +3767,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3502,6 +3785,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3630,6 +3915,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3657,6 +3945,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3687,6 +3979,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3741,6 +4037,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3758,6 +4056,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3898,6 +4198,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3925,6 +4228,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3955,6 +4262,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4009,6 +4320,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4026,6 +4339,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4166,6 +4481,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4192,6 +4510,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4220,6 +4542,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4272,6 +4598,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4288,6 +4616,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4416,6 +4746,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4442,6 +4775,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4470,6 +4807,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4522,6 +4863,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4538,6 +4881,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4666,6 +5011,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4693,6 +5041,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4723,6 +5075,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4777,6 +5133,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4794,6 +5152,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4934,6 +5294,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4961,6 +5324,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4991,6 +5358,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5045,6 +5416,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5062,6 +5435,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5202,6 +5577,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5229,6 +5607,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5259,6 +5641,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5313,6 +5699,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5330,6 +5718,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5470,6 +5860,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5497,6 +5890,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5527,6 +5924,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5581,6 +5982,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5598,6 +6001,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5738,6 +6143,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -5766,6 +6174,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -5794,6 +6206,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -5850,6 +6266,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5868,6 +6286,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6011,6 +6431,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6040,6 +6463,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6069,6 +6496,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6127,6 +6558,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6146,6 +6579,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6298,6 +6733,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6327,6 +6765,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6357,6 +6799,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6415,6 +6861,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6434,6 +6882,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6589,6 +7039,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6619,6 +7072,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6650,6 +7107,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6710,6 +7171,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6730,6 +7193,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6894,6 +7359,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6924,6 +7392,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6955,6 +7427,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7015,6 +7491,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7035,6 +7513,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7199,6 +7679,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7228,6 +7711,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7257,6 +7744,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7315,6 +7806,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7334,6 +7827,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7486,6 +7981,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7515,6 +8013,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7544,6 +8046,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7602,6 +8108,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7621,6 +8129,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7773,6 +8283,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7803,6 +8316,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7834,6 +8351,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7894,6 +8415,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7914,6 +8437,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8078,6 +8603,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8108,6 +8636,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8139,6 +8671,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8199,6 +8735,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8219,6 +8757,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8383,6 +8923,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8413,6 +8956,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8444,6 +8991,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8504,6 +9055,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8524,6 +9077,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8688,6 +9243,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8718,6 +9276,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8749,6 +9311,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8809,6 +9375,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8829,6 +9397,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8993,6 +9563,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9023,6 +9596,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9054,6 +9631,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9114,6 +9695,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9134,6 +9717,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9298,6 +9883,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9328,6 +9916,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9359,6 +9951,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9419,6 +10015,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9439,6 +10037,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9603,6 +10203,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9633,6 +10236,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9664,6 +10271,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9724,6 +10335,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9744,6 +10357,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9908,6 +10523,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9938,6 +10556,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9969,6 +10591,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10029,6 +10655,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10049,6 +10677,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10213,6 +10843,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10228,6 +10861,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10244,6 +10881,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10275,6 +10916,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10289,6 +10932,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10398,6 +11043,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10413,6 +11061,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10429,6 +11081,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10460,6 +11116,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10474,6 +11132,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10583,6 +11243,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10598,6 +11261,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10616,6 +11283,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10647,6 +11318,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10661,6 +11334,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10778,6 +11453,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10793,6 +11471,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10813,6 +11495,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10844,6 +11530,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10858,6 +11546,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -10983,6 +11673,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10994,6 +11687,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11006,6 +11703,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11029,6 +11730,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11040,6 +11743,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11123,6 +11828,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11134,6 +11842,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11146,6 +11858,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11169,6 +11885,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11180,6 +11898,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11263,6 +11983,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ; GFX7-LABEL: flat_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11274,6 +11997,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11288,6 +12015,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11311,6 +12042,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11322,6 +12055,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11414,6 +12149,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11425,6 +12163,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11439,6 +12181,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11462,6 +12208,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11473,6 +12221,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11565,6 +12315,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11577,6 +12330,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11590,6 +12347,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11615,6 +12376,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11627,6 +12390,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11717,6 +12482,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11729,6 +12497,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11744,6 +12516,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11769,6 +12545,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11781,6 +12559,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11879,6 +12659,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11891,6 +12674,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11906,6 +12693,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11931,6 +12722,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11943,6 +12736,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12042,6 +12837,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12054,6 +12852,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12071,6 +12873,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12096,6 +12902,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12108,6 +12916,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12215,6 +13025,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12227,6 +13040,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12244,6 +13061,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12269,6 +13090,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12281,6 +13104,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12388,6 +13213,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12403,6 +13231,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12421,6 +13253,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12452,6 +13288,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12466,6 +13304,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12583,6 +13423,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12598,6 +13441,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12618,6 +13465,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12649,6 +13500,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12663,6 +13516,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12789,6 +13644,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12804,6 +13662,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12824,6 +13686,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12855,6 +13721,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12869,6 +13737,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -12995,6 +13865,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13020,6 +13893,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13045,6 +13922,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13095,6 +13976,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13110,6 +13993,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13226,6 +14111,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13251,6 +14139,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13278,6 +14170,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13328,6 +14224,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13343,6 +14241,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13467,6 +14367,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13492,6 +14395,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13519,6 +14426,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13569,6 +14480,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13584,6 +14497,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13709,6 +14624,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13734,6 +14652,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13763,6 +14685,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13813,6 +14739,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13828,6 +14756,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13961,6 +14891,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13986,6 +14919,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14015,6 +14952,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14065,6 +15006,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14080,6 +15023,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14213,6 +15158,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14238,6 +15186,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14265,6 +15217,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14315,6 +15271,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14330,6 +15288,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14454,6 +15414,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14479,6 +15442,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14506,6 +15473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14556,6 +15527,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14571,6 +15544,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14695,6 +15670,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14720,6 +15698,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14749,6 +15731,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14799,6 +15785,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14814,6 +15802,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14947,6 +15937,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14972,6 +15965,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15001,6 +15998,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15051,6 +16052,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15066,6 +16069,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15199,6 +16204,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15224,6 +16232,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15253,6 +16265,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15303,6 +16319,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15318,6 +16336,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15451,6 +16471,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15476,6 +16499,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15505,6 +16532,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15555,6 +16586,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15570,6 +16603,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15703,6 +16738,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15728,6 +16766,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15757,6 +16799,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15807,6 +16853,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15822,6 +16870,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15955,6 +17005,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15980,6 +17033,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16009,6 +17066,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16059,6 +17120,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16074,6 +17137,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16207,6 +17272,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16232,6 +17300,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16261,6 +17333,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16311,6 +17387,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16326,6 +17404,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16459,6 +17539,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16484,6 +17567,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16513,6 +17600,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16563,6 +17654,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16578,6 +17671,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16711,6 +17806,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16739,6 +17837,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16767,6 +17869,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -16823,6 +17929,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16841,6 +17949,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16984,6 +18094,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17012,6 +18125,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17042,6 +18159,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17098,6 +18219,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17116,6 +18239,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17267,6 +18392,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17295,6 +18423,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17325,6 +18457,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17381,6 +18517,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17399,6 +18537,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17551,6 +18691,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17579,6 +18722,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17611,6 +18758,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17667,6 +18818,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17685,6 +18838,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17845,6 +19000,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17873,6 +19031,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17905,6 +19067,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17961,6 +19127,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17979,6 +19147,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18139,6 +19309,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18167,6 +19340,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18197,6 +19374,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18253,6 +19434,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18271,6 +19454,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18422,6 +19607,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18450,6 +19638,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18480,6 +19672,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18536,6 +19732,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18554,6 +19752,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18705,6 +19905,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18733,6 +19936,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18765,6 +19972,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18821,6 +20032,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18839,6 +20052,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18999,6 +20214,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19027,6 +20245,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19059,6 +20281,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19115,6 +20341,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19133,6 +20361,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19293,6 +20523,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19321,6 +20554,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19353,6 +20590,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19409,6 +20650,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19427,6 +20670,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19587,6 +20832,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19615,6 +20863,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19647,6 +20899,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19703,6 +20959,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19721,6 +20979,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19881,6 +21141,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19909,6 +21172,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19941,6 +21208,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19997,6 +21268,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20015,6 +21288,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20175,6 +21450,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20203,6 +21481,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20235,6 +21517,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20291,6 +21577,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20309,6 +21597,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20469,6 +21759,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20497,6 +21790,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20529,6 +21826,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20585,6 +21886,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20603,6 +21906,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20763,6 +22068,9 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20791,6 +22099,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
+; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20823,6 +22135,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
+; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20879,6 +22195,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20897,6 +22215,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index b56860991b1948..b20b4ab47a59bb 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_agent_unordered_load(
 ;
 ; GFX7-LABEL: global_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -236,6 +239,9 @@ define amdgpu_kernel void @global_agent_monotonic_load(
 ;
 ; GFX7-LABEL: global_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -433,6 +439,9 @@ define amdgpu_kernel void @global_agent_acquire_load(
 ;
 ; GFX7-LABEL: global_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -650,6 +659,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -872,6 +884,9 @@ define amdgpu_kernel void @global_agent_unordered_store(
 ;
 ; GFX7-LABEL: global_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1039,6 +1054,9 @@ define amdgpu_kernel void @global_agent_monotonic_store(
 ;
 ; GFX7-LABEL: global_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1207,6 +1225,9 @@ define amdgpu_kernel void @global_agent_release_store(
 ;
 ; GFX7-LABEL: global_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1401,6 +1422,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1594,6 +1618,9 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1754,6 +1781,9 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1940,6 +1970,9 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2127,6 +2160,9 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2341,6 +2377,9 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2555,6 +2594,9 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2762,6 +2804,9 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2995,6 +3040,9 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3228,6 +3276,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3454,6 +3505,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3706,6 +3760,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3959,6 +4016,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4239,6 +4299,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4518,6 +4581,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4771,6 +4837,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5025,6 +5094,9 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5305,6 +5377,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5585,6 +5660,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5865,6 +5943,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6145,6 +6226,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6425,6 +6509,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6705,6 +6792,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6985,6 +7075,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7265,6 +7358,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7521,6 +7617,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7797,6 +7896,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8080,6 +8182,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8383,6 +8488,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8685,6 +8793,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8961,6 +9072,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9238,6 +9352,9 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9541,6 +9658,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9844,6 +9964,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10147,6 +10270,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10450,6 +10576,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10753,6 +10882,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11056,6 +11188,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11359,6 +11494,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11660,6 +11798,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11856,6 +11997,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12053,6 +12197,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12270,6 +12417,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12492,6 +12642,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12659,6 +12812,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12827,6 +12983,9 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13021,6 +13180,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13214,6 +13376,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13374,6 +13539,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13560,6 +13728,9 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13747,6 +13918,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13961,6 +14135,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14175,6 +14352,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14382,6 +14562,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14615,6 +14798,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14848,6 +15034,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15074,6 +15263,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15326,6 +15518,9 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15579,6 +15774,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15859,6 +16057,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16138,6 +16339,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16391,6 +16595,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16645,6 +16852,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16925,6 +17135,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17205,6 +17418,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17485,6 +17701,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17765,6 +17984,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18045,6 +18267,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18325,6 +18550,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18605,6 +18833,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18885,6 +19116,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19141,6 +19375,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19418,6 +19655,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19721,6 +19961,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20023,6 +20266,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20299,6 +20545,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20576,6 +20825,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20879,6 +21131,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21182,6 +21437,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21485,6 +21743,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21788,6 +22049,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22091,6 +22355,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22394,6 +22661,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22697,6 +22967,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 9b2b3a4cfa9bae..b97715e0ed8c95 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -35,6 +35,9 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ;
 ; GFX7-LABEL: global_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -240,6 +243,9 @@ define amdgpu_kernel void @global_nontemporal_load_1(
 ;
 ; GFX7-LABEL: global_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -498,6 +504,9 @@ define amdgpu_kernel void @global_nontemporal_store_0(
 ;
 ; GFX7-LABEL: global_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -698,6 +707,9 @@ define amdgpu_kernel void @global_nontemporal_store_1(
 ;
 ; GFX7-LABEL: global_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -945,6 +957,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: global_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index afc46fbc23a67a..1d7ba89e339597 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
 ;
 ; GFX7-LABEL: global_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -236,6 +239,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -432,6 +438,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -628,6 +637,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -817,6 +829,9 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
 ;
 ; GFX7-LABEL: global_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -984,6 +999,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1151,6 +1169,9 @@ define amdgpu_kernel void @global_singlethread_release_store(
 ;
 ; GFX7-LABEL: global_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1318,6 +1339,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1485,6 +1509,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1643,6 +1670,9 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1801,6 +1831,9 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1959,6 +1992,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2117,6 +2153,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2277,6 +2316,9 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2463,6 +2505,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2649,6 +2694,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2837,6 +2885,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3061,6 +3112,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3285,6 +3339,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3509,6 +3566,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3733,6 +3793,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3957,6 +4020,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4181,6 +4247,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4405,6 +4474,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4629,6 +4701,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4853,6 +4928,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5077,6 +5155,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5301,6 +5382,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5525,6 +5609,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5749,6 +5836,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5973,6 +6063,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6200,6 +6293,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6455,6 +6551,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6710,6 +6809,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6965,6 +7067,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7220,6 +7325,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7475,6 +7583,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7730,6 +7841,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7985,6 +8099,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8240,6 +8357,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8495,6 +8615,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8750,6 +8873,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9005,6 +9131,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9260,6 +9389,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9515,6 +9647,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9770,6 +9905,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10025,6 +10163,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10221,6 +10362,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10417,6 +10561,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10613,6 +10760,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10802,6 +10952,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10969,6 +11122,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11136,6 +11292,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11303,6 +11462,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11470,6 +11632,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11628,6 +11793,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11786,6 +11954,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11944,6 +12115,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12102,6 +12276,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12262,6 +12439,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12448,6 +12628,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12634,6 +12817,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12822,6 +13008,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13046,6 +13235,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13270,6 +13462,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13494,6 +13689,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13718,6 +13916,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13942,6 +14143,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14166,6 +14370,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14390,6 +14597,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14614,6 +14824,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14838,6 +15051,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15062,6 +15278,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15286,6 +15505,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15510,6 +15732,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15734,6 +15959,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15958,6 +16186,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16185,6 +16416,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16440,6 +16674,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16695,6 +16932,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16950,6 +17190,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17205,6 +17448,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17460,6 +17706,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17715,6 +17964,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17970,6 +18222,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18225,6 +18480,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18480,6 +18738,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18735,6 +18996,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18990,6 +19254,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19245,6 +19512,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19500,6 +19770,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19755,6 +20028,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 62a4f3b43b2dcd..c1711040bf4014 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_system_unordered_load(
 ;
 ; GFX7-LABEL: global_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -236,6 +239,9 @@ define amdgpu_kernel void @global_system_monotonic_load(
 ;
 ; GFX7-LABEL: global_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -433,6 +439,9 @@ define amdgpu_kernel void @global_system_acquire_load(
 ;
 ; GFX7-LABEL: global_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -652,6 +661,9 @@ define amdgpu_kernel void @global_system_seq_cst_load(
 ;
 ; GFX7-LABEL: global_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -876,6 +888,9 @@ define amdgpu_kernel void @global_system_unordered_store(
 ;
 ; GFX7-LABEL: global_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1043,6 +1058,9 @@ define amdgpu_kernel void @global_system_monotonic_store(
 ;
 ; GFX7-LABEL: global_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1211,6 +1229,9 @@ define amdgpu_kernel void @global_system_release_store(
 ;
 ; GFX7-LABEL: global_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1407,6 +1428,9 @@ define amdgpu_kernel void @global_system_seq_cst_store(
 ;
 ; GFX7-LABEL: global_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1602,6 +1626,9 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1762,6 +1789,9 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1950,6 +1980,9 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2139,6 +2172,9 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2357,6 +2393,9 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2575,6 +2614,9 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2784,6 +2826,9 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3021,6 +3066,9 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3258,6 +3306,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3484,6 +3535,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3738,6 +3792,9 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3993,6 +4050,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4277,6 +4337,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4560,6 +4623,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4815,6 +4881,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5071,6 +5140,9 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5355,6 +5427,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5639,6 +5714,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5923,6 +6001,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6207,6 +6288,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6463,6 +6547,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6742,6 +6829,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7049,6 +7139,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7355,6 +7448,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7633,6 +7729,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7912,6 +8011,9 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8219,6 +8321,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8526,6 +8631,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8833,6 +8941,9 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9140,6 +9251,9 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9447,6 +9561,9 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9754,6 +9871,9 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10061,6 +10181,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10366,6 +10489,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10562,6 +10688,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10759,6 +10888,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10978,6 +11110,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11202,6 +11337,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11369,6 +11507,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11537,6 +11678,9 @@ define amdgpu_kernel void @global_system_one_as_release_store(
 ;
 ; GFX7-LABEL: global_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11733,6 +11877,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11928,6 +12075,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12088,6 +12238,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12276,6 +12429,9 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12465,6 +12621,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12683,6 +12842,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12901,6 +13063,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13110,6 +13275,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13347,6 +13515,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13584,6 +13755,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13810,6 +13984,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14064,6 +14241,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14319,6 +14499,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14603,6 +14786,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14886,6 +15072,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15141,6 +15330,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15397,6 +15589,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15681,6 +15876,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15965,6 +16163,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16249,6 +16450,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16533,6 +16737,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16817,6 +17024,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17101,6 +17311,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17385,6 +17598,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17669,6 +17885,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17925,6 +18144,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18203,6 +18425,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18488,6 +18713,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18795,6 +19023,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19101,6 +19332,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19379,6 +19613,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19658,6 +19895,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19965,6 +20205,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20272,6 +20515,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20579,6 +20825,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20886,6 +21135,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21193,6 +21445,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21500,6 +21755,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21807,6 +22065,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index a98efb49b4b72b..5581d76f539746 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -36,6 +36,9 @@ define amdgpu_kernel void @global_volatile_load_0(
 ;
 ; GFX7-LABEL: global_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -190,6 +193,9 @@ define amdgpu_kernel void @global_volatile_load_1(
 ;
 ; GFX7-LABEL: global_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -381,6 +387,9 @@ define amdgpu_kernel void @global_volatile_store_0(
 ;
 ; GFX7-LABEL: global_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -544,6 +553,9 @@ define amdgpu_kernel void @global_volatile_store_1(
 ;
 ; GFX7-LABEL: global_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -741,6 +753,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
 ;
 ; GFX7-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -884,6 +899,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
 ;
 ; GFX7-LABEL: global_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index f805e2cf37006c..3c1e0e1d3d9afc 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
 ;
 ; GFX7-LABEL: global_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -236,6 +239,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -432,6 +438,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -628,6 +637,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -817,6 +829,9 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
 ;
 ; GFX7-LABEL: global_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -984,6 +999,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1151,6 +1169,9 @@ define amdgpu_kernel void @global_wavefront_release_store(
 ;
 ; GFX7-LABEL: global_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1318,6 +1339,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1485,6 +1509,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1643,6 +1670,9 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1801,6 +1831,9 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1959,6 +1992,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2117,6 +2153,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2277,6 +2316,9 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2463,6 +2505,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2649,6 +2694,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2837,6 +2885,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3061,6 +3112,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3285,6 +3339,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3509,6 +3566,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3733,6 +3793,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3957,6 +4020,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4181,6 +4247,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4405,6 +4474,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4629,6 +4701,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4853,6 +4928,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5077,6 +5155,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5301,6 +5382,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5525,6 +5609,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5749,6 +5836,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5973,6 +6063,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6200,6 +6293,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6455,6 +6551,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6710,6 +6809,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6965,6 +7067,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7220,6 +7325,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7475,6 +7583,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7730,6 +7841,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7985,6 +8099,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8240,6 +8357,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8495,6 +8615,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8750,6 +8873,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9005,6 +9131,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9260,6 +9389,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9515,6 +9647,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9770,6 +9905,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10025,6 +10163,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10221,6 +10362,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10417,6 +10561,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10613,6 +10760,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10802,6 +10952,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10969,6 +11122,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11136,6 +11292,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11303,6 +11462,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11470,6 +11632,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11628,6 +11793,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11786,6 +11954,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11944,6 +12115,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12102,6 +12276,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12262,6 +12439,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12448,6 +12628,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12634,6 +12817,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12822,6 +13008,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13046,6 +13235,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13270,6 +13462,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13494,6 +13689,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13718,6 +13916,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13942,6 +14143,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14166,6 +14370,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14390,6 +14597,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14614,6 +14824,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14838,6 +15051,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15062,6 +15278,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15286,6 +15505,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15510,6 +15732,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15734,6 +15959,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15958,6 +16186,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16185,6 +16416,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16440,6 +16674,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16695,6 +16932,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16950,6 +17190,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17205,6 +17448,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17460,6 +17706,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17715,6 +17964,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17970,6 +18222,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18225,6 +18480,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18480,6 +18738,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18735,6 +18996,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18990,6 +19254,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19245,6 +19512,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19500,6 +19770,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19755,6 +20028,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 30bf4920715352..8d4b24de61deb4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -40,6 +40,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
 ;
 ; GFX7-LABEL: global_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -236,6 +239,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -432,6 +438,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -636,6 +645,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -841,6 +853,9 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
 ;
 ; GFX7-LABEL: global_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1008,6 +1023,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1176,6 +1194,9 @@ define amdgpu_kernel void @global_workgroup_release_store(
 ;
 ; GFX7-LABEL: global_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1362,6 +1383,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1547,6 +1571,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1705,6 +1732,9 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1874,6 +1904,9 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2051,6 +2084,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2238,6 +2274,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2426,6 +2465,9 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2620,6 +2662,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2832,6 +2877,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3045,6 +3093,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3269,6 +3320,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3504,6 +3558,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3747,6 +3804,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4000,6 +4060,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4252,6 +4315,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4486,6 +4552,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4721,6 +4790,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4974,6 +5046,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5227,6 +5302,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5480,6 +5558,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5733,6 +5814,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5986,6 +6070,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6239,6 +6326,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6492,6 +6582,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6747,6 +6840,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7002,6 +7098,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7265,6 +7364,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7539,6 +7641,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7820,6 +7925,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8100,6 +8208,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8362,6 +8473,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8625,6 +8739,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8906,6 +9023,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9187,6 +9307,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9468,6 +9591,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9749,6 +9875,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10030,6 +10159,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10311,6 +10443,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10592,6 +10727,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10872,6 +11010,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11068,6 +11209,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11264,6 +11408,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11467,6 +11614,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11669,6 +11819,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11836,6 +11989,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12003,6 +12159,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12181,6 +12340,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12359,6 +12521,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12517,6 +12682,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12685,6 +12853,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12854,6 +13025,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13033,6 +13207,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13214,6 +13391,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13407,6 +13587,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13611,6 +13794,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13817,6 +14003,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14041,6 +14230,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14275,6 +14467,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14510,6 +14705,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14755,6 +14953,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15000,6 +15201,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15234,6 +15438,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15468,6 +15675,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15713,6 +15923,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15958,6 +16171,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16203,6 +16419,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16448,6 +16667,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16693,6 +16915,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16938,6 +17163,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17183,6 +17411,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17431,6 +17662,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17686,6 +17920,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17948,6 +18185,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18214,6 +18454,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18487,6 +18730,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18760,6 +19006,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19022,6 +19271,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19284,6 +19536,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19557,6 +19812,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19830,6 +20088,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20103,6 +20364,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20376,6 +20640,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20649,6 +20916,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20922,6 +21192,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21195,6 +21468,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index ba9711333a1946..403e3012baa222 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -37,6 +37,9 @@ define amdgpu_kernel void @local_nontemporal_load_0(
 ;
 ; GFX7-LABEL: local_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -238,6 +241,9 @@ define amdgpu_kernel void @local_nontemporal_load_1(
 ;
 ; GFX7-LABEL: local_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -881,6 +887,9 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: local_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index a52dd9b3401696..454b11e3e67251 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -33,6 +33,9 @@ define amdgpu_kernel void @local_volatile_load_0(
 ;
 ; GFX7-LABEL: local_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -178,6 +181,9 @@ define amdgpu_kernel void @local_volatile_load_1(
 ;
 ; GFX7-LABEL: local_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 4e08065e879fd8..9166046b55fca3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -37,7 +37,10 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX7-LABEL: private_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -53,7 +56,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -69,7 +72,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -110,7 +113,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -126,7 +129,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -246,7 +249,10 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX7-LABEL: private_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -264,7 +270,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
@@ -282,7 +288,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
@@ -327,7 +333,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
@@ -348,7 +354,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
@@ -499,7 +505,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX7-LABEL: private_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -514,7 +520,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -530,7 +536,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -565,7 +571,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -581,7 +587,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -695,7 +701,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX7-LABEL: private_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -712,7 +718,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -728,7 +734,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -765,7 +771,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -784,7 +790,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -929,7 +935,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: private_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -945,7 +954,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -961,7 +970,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1002,7 +1011,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1018,7 +1027,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index a68b5f36b806ed..ddee2f39c4ba16 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -37,7 +37,10 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX7-LABEL: private_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -53,7 +56,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX10-WGP-LABEL: private_volatile_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -69,7 +72,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX10-CU-LABEL: private_volatile_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -196,7 +199,10 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX7-LABEL: private_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -214,7 +220,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX10-WGP-LABEL: private_volatile_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
@@ -232,7 +238,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX10-CU-LABEL: private_volatile_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
@@ -378,7 +384,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX7-LABEL: private_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -394,7 +400,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX10-WGP-LABEL: private_volatile_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -411,7 +417,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX10-CU-LABEL: private_volatile_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -539,7 +545,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX7-LABEL: private_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s13
+; GFX7-NEXT:    s_add_u32 s0, s0, s15
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -557,7 +563,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX10-WGP-LABEL: private_volatile_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -574,7 +580,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX10-CU-LABEL: private_volatile_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
index d070dc3b770f81..1fd311f225db4a 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -203,13 +203,13 @@ attributes #5 = { "amdgpu-flat-work-group-size"="128,512" }
 attributes #6 = { "amdgpu-flat-work-group-size"="512,512" }
 attributes #7 = { "amdgpu-flat-work-group-size"="64,256" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index f62f1d57aec8e4..9577f2a932f966 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -399,26 +399,26 @@ attributes #17 = { "amdgpu-waves-per-eu"="5,8" }
 attributes #18 = { "amdgpu-waves-per-eu"="9,10" }
 attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
index c1d647c5d3b907..9758afa3b49912 100644
--- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
@@ -19,5 +19,5 @@ define void @hoge()  {
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index 384a9c4043a1d3..ab51d6e249e903 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -191,11 +191,11 @@ define amdgpu_kernel void @kernel_lds_recursion() {
 !1 = !{i32 1, !"amdhsa_code_object_version", i32 400}
 
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
 ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index d430ba758572d6..2292b45ddf15ae 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -7,7 +7,7 @@
 define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 {
 ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_u32 s0, s0, s13
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; GCN-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index cca7b49996ff3b..183cdb2e1f8627 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -80,8 +80,8 @@ define amdgpu_kernel void @test_simple_indirect_call() {
 ;.
 ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" }
 ;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
 ;.
 ; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
index 049db01badacf9..2ab48479fa1b08 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
@@ -31,6 +31,6 @@ define amdgpu_kernel void @kernel1() #1 {
 
 attributes #0 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
index c9387f196dff96..cc58d34a8b255b 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
@@ -98,7 +98,7 @@ define amdgpu_kernel void @kernel2() #0 {
 attributes #0 = { "uniform-work-group-size"="true" }
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR2]] = { "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
index 7183da2c5efc36..33298bde89b975 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
@@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel3() #2 {
 
 attributes #2 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
index 6ed04cf63d20be..6fe85cd7c9e654 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @kernel2() #2 {
 
 attributes #1 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
index d5ba2fd617c6ef..f6ab402e012325 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
@@ -52,8 +52,8 @@ attributes #0 = { nounwind }
 attributes #1 = { "uniform-work-group-size"="false" }
 attributes #2 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3]] = { "uniform-work-group-size"="true" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
index 7f0dfeaf75c808..37e3376ef9d500 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
@@ -101,7 +101,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 {
 attributes #0 = { nounwind readnone }
 attributes #1 = { "uniform-work-group-size"="true" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
index 8616c73ad51c13..7f83686bc756e8 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
@@ -61,6 +61,6 @@ define amdgpu_kernel void @kernel3() #0 {
 
 attributes #0 = { "uniform-work-group-size"="false" }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index 50927a2cf21afe..3f2b046ef16615 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -11,7 +11,7 @@
 define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9()  {
 ; CHECK-LABEL: __omp_offloading_16_dd2df_main_l9:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_add_u32 s0, s0, s13
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v0

>From 0ec30aead6410d5b98a65bad19834d92dcbc1f2a Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Fri, 16 Aug 2024 10:43:34 -0700
Subject: [PATCH 2/8] (1) Use getCalledFunction instead of getCalledOperand (2)
 other minor code changes based on reviews (3) fix test files.

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |  22 +-
 .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll  | 368 ++++++++++++--
 .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll  | 390 +++++++++++++--
 .../AMDGPU/GlobalISel/extractelement.ll       |  69 +--
 ...licit-kernarg-backend-usage-global-isel.ll |  50 +-
 .../GlobalISel/insertelement-stack-lower.ll   |   2 +-
 .../AMDGPU/GlobalISel/lds-global-value.ll     |   5 +-
 .../GlobalISel/llvm.amdgcn.if.break.i64.ll    |   3 +
 .../GlobalISel/llvm.amdgcn.trig.preop.ll      |  24 +
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll |  33 ++
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll |  33 ++
 .../abi-attribute-hints-undefined-behavior.ll |  18 +-
 llvm/test/CodeGen/AMDGPU/addrspacecast.ll     |  10 +-
 llvm/test/CodeGen/AMDGPU/always-uniform.ll    |   3 +
 ...amdgpu-codegenprepare-fold-binop-select.ll |   3 +
 .../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll   |   4 +-
 .../annotate-kernel-features-hsa-call.ll      |   2 +-
 .../AMDGPU/annotate-kernel-features-hsa.ll    |   4 +-
 .../attr-amdgpu-flat-work-group-size.ll       |   4 +-
 .../CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll    |   6 +-
 .../AMDGPU/attr-amdgpu-waves-per-eu.ll        |   4 +-
 .../attributor-flatscratchinit-globalisel.ll  |   2 +-
 .../AMDGPU/attributor-flatscratchinit.ll      | 257 +++++-----
 .../AMDGPU/call-graph-register-usage.ll       |   8 +-
 llvm/test/CodeGen/AMDGPU/code-object-v3.ll    |  12 +-
 .../CodeGen/AMDGPU/combine-reg-or-const.ll    |   3 +
 ...dagcomb-extract-vec-elt-different-sizes.ll |   2 +
 ...cannot-create-empty-or-backward-segment.ll |   2 +-
 .../expand-scalar-carry-out-select-user.ll    |   3 +
 llvm/test/CodeGen/AMDGPU/fabs.f16.ll          |  66 +++
 .../fast-unaligned-load-store.global.ll       |  20 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     | 236 ++++++++-
 .../flat-for-global-subtarget-feature.ll      |   7 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll  |  63 ++-
 .../AMDGPU/fmul-2-combine-multi-use.ll        |  48 ++
 llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll     |  60 +++
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |   3 +
 llvm/test/CodeGen/AMDGPU/fneg.f16.ll          |  62 ++-
 llvm/test/CodeGen/AMDGPU/half.ll              | 231 +++++++++
 .../AMDGPU/hsa-metadata-kernel-code-props.ll  |   9 +-
 llvm/test/CodeGen/AMDGPU/hsa.ll               |   4 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |  50 +-
 .../indirect-call-set-from-other-function.ll  |   2 +-
 llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll   |  24 +-
 .../AMDGPU/insert_vector_elt.v2bf16.ll        |  58 ++-
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 214 +++++++-
 llvm/test/CodeGen/AMDGPU/ipra.ll              |   2 +-
 llvm/test/CodeGen/AMDGPU/kernarg-size.ll      |   2 +-
 llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll  | 248 +++++++---
 .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll       |  37 +-
 .../AMDGPU/llvm.amdgcn.readfirstlane.ll       |  70 ++-
 .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll    | 114 ++++-
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   | 126 ++++-
 llvm/test/CodeGen/AMDGPU/load-constant-f64.ll |   6 +
 llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 127 ++++-
 llvm/test/CodeGen/AMDGPU/load-constant-i32.ll |  85 +++-
 llvm/test/CodeGen/AMDGPU/load-constant-i64.ll |  18 +
 llvm/test/CodeGen/AMDGPU/load-constant-i8.ll  | 164 ++++++-
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   | 129 ++++-
 llvm/test/CodeGen/AMDGPU/load-global-i32.ll   |  83 +++-
 llvm/test/CodeGen/AMDGPU/load-select-ptr.ll   |   3 +-
 .../AMDGPU/lower-module-lds-via-hybrid.ll     |  16 +-
 .../AMDGPU/lower-module-lds-via-table.ll      |  15 +-
 .../CodeGen/AMDGPU/mad24-get-global-id.ll     |   2 +-
 .../match-perm-extract-vector-elt-bug.ll      |   8 +-
 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll    |  20 +-
 llvm/test/CodeGen/AMDGPU/min.ll               | 210 ++++++++
 llvm/test/CodeGen/AMDGPU/pack.v2f16.ll        |  21 +
 llvm/test/CodeGen/AMDGPU/pack.v2i16.ll        |  18 +
 ...al-regcopy-and-spill-missed-at-regalloc.ll |  56 +--
 .../AMDGPU/propagate-flat-work-group-size.ll  |   2 +-
 .../CodeGen/AMDGPU/propagate-waves-per-eu.ll  |   4 +-
 .../AMDGPU/remove-no-kernel-id-attribute.ll   |   8 +-
 llvm/test/CodeGen/AMDGPU/sad.ll               |  68 ++-
 .../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll  |  16 +
 .../scc-clobbered-sgpr-to-vmem-spill.ll       | 456 +++++++++---------
 llvm/test/CodeGen/AMDGPU/shift-i128.ll        |  24 +-
 .../CodeGen/AMDGPU/simple-indirect-call-2.ll  |   7 +-
 llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll    |  70 ++-
 .../CodeGen/AMDGPU/spill-vector-superclass.ll |   6 +-
 llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll |   6 +
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll  |   2 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll  |   2 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll   |   9 +-
 llvm/test/CodeGen/AMDGPU/trap-abis.ll         |  16 +-
 llvm/test/CodeGen/AMDGPU/udiv.ll              |  45 ++
 llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll    |  91 +++-
 ...ine-function-info-long-branch-reg-debug.ll |   7 +-
 .../machine-function-info-long-branch-reg.ll  |   7 +-
 89 files changed, 4094 insertions(+), 835 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 2017ab70c96f61..1d3a3cea3e2187 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -435,17 +435,11 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       return;
     }
 
-    bool HasAllocaOrASCast = false;
-    for (BasicBlock &BB : *F) {
-      for (Instruction &I : BB) {
-        if (isa<AllocaInst>(I) || isa<AddrSpaceCastInst>(I)) {
-          HasAllocaOrASCast = true;
-          removeAssumedBits(FLAT_SCRATCH_INIT);
-          break;
-        }
+    for (Instruction &I : instructions(F)) {
+      if (isa<AllocaInst>(I) || isa<AddrSpaceCastInst>(I)) {
+        removeAssumedBits(FLAT_SCRATCH_INIT);
+        return;
       }
-      if (HasAllocaOrASCast)
-        break;
     }
   }
 
@@ -702,13 +696,12 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     // no-flat-scratch-init.
     auto CheckForNoFlatScratchInit = [&](Instruction &I) {
       const auto &CB = cast<CallBase>(I);
-      const Value *CalleeOp = CB.getCalledOperand();
-      const Function *Callee = dyn_cast<Function>(CalleeOp);
+      const Function *Callee = CB.getCalledFunction();
       if (!Callee) // indirect call
         return CB.isInlineAsm();
 
       if (Callee->isIntrinsic())
-        return true;
+        return Callee->getIntrinsicID() != Intrinsic::amdgcn_addrspacecast_nonnull;
 
       const auto *CalleeInfo = A.getAAFor<AAAMDAttributes>(
           *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
@@ -717,7 +710,8 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
 
     bool UsedAssumedInformation = false;
     // If any callee is false (i.e. need FlatScratchInit),
-    // checkForAllCallLikeInstructions returns false
+    // checkForAllCallLikeInstructions returns false, in which case this
+    // function returns true.
     return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                               UsedAssumedInformation);
   }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index 705bcbddf227a6..cb64c25b5f080e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -99,11 +105,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v0 offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -114,11 +123,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v0 offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -291,6 +303,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -306,6 +321,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -365,6 +383,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -382,6 +403,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -444,6 +468,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -461,6 +488,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -523,6 +553,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -535,6 +568,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -585,6 +621,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -599,6 +638,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -652,6 +694,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_dec_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -666,6 +711,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_dec_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -720,7 +768,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    v_mov_b32_e32 v3, 42
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -728,6 +778,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -742,7 +793,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 42
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -750,6 +803,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -814,6 +868,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -831,6 +888,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -890,6 +950,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -905,6 +968,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -920,6 +986,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_ret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -934,6 +1002,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -970,6 +1042,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -987,6 +1062,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -1004,6 +1082,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1018,6 +1098,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1057,6 +1141,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -1074,6 +1161,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -1091,6 +1181,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1105,6 +1197,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1144,6 +1240,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1156,6 +1255,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1168,6 +1270,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1179,6 +1283,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1211,6 +1319,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -1225,6 +1336,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -1239,6 +1353,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1250,6 +1366,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1285,6 +1405,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -1299,6 +1422,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -1313,6 +1439,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1324,6 +1452,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1360,7 +1492,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    v_mov_b32_e32 v3, 42
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1368,6 +1502,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -1382,7 +1517,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 42
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1390,6 +1527,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -1404,6 +1542,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1422,6 +1562,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 42
@@ -1478,6 +1622,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1495,6 +1642,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1512,6 +1662,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -1525,6 +1677,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1571,10 +1727,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1592,10 +1751,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1613,7 +1775,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_ret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -1628,6 +1792,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1666,12 +1834,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1689,12 +1860,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1712,7 +1886,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -1727,6 +1903,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1768,10 +1948,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1781,10 +1964,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1794,7 +1980,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -1806,6 +1994,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1840,12 +2032,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1855,12 +2050,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1870,7 +2068,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -1882,6 +2082,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1919,12 +2123,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1934,12 +2141,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1949,7 +2159,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -1961,6 +2173,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1999,6 +2215,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2025,6 +2244,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2051,12 +2273,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2070,6 +2294,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2128,6 +2356,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2146,6 +2377,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2164,12 +2398,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_dec_x2 v[3:4], v[1:2] offset:40
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2178,6 +2414,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2231,8 +2471,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v2 offset:8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    flat_store_dword v[0:1], v3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2249,8 +2492,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v2 offset:8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2326,7 +2572,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -2342,7 +2591,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -2410,7 +2662,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -2426,7 +2681,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -2612,10 +2870,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2628,10 +2889,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2691,12 +2955,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2709,12 +2976,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2775,12 +3045,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_dec_ret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2793,12 +3066,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_dec_ret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2859,10 +3135,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2872,10 +3151,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2926,12 +3208,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2941,12 +3226,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2998,12 +3286,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_dec_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3013,12 +3304,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_dec_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3071,6 +3365,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3094,6 +3391,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3170,6 +3470,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3188,6 +3491,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3258,7 +3564,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v4, s3
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v3, s2
 ; CI-NEXT:    flat_store_dword v[3:4], v0
@@ -3277,7 +3586,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v4, s3
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-NEXT:    flat_store_dword v[3:4], v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index b3a7e65f771c43..00ff2d7a35d561 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -99,11 +105,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0 offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -114,11 +123,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0 offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -291,6 +303,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_inc_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -306,6 +321,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_inc_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -365,6 +383,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_inc_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -382,6 +403,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_inc_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -444,6 +468,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
 ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -461,6 +488,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
 ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -523,6 +553,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_inc_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -535,6 +568,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_inc_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -585,6 +621,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_inc_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -599,6 +638,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_inc_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -652,6 +694,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_inc_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -666,6 +711,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_inc_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -720,7 +768,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    v_mov_b32_e32 v3, 42
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -728,6 +778,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -742,7 +793,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 42
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -750,6 +803,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -814,6 +868,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -831,6 +888,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -896,8 +956,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v2 offset:8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    flat_store_dword v[0:1], v3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -914,8 +977,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v2 offset:8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -991,7 +1057,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -1007,7 +1076,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -1075,7 +1147,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -1091,7 +1166,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -1277,10 +1355,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_inc_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1293,10 +1374,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_inc_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1356,12 +1440,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_inc_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1374,12 +1461,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_inc_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1440,12 +1530,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_inc_ret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1458,12 +1551,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_inc_ret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1524,10 +1620,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_inc_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1537,10 +1636,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_inc_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1591,12 +1693,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_inc_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1606,12 +1711,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_inc_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1663,12 +1771,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_inc_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1678,12 +1789,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_inc_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1736,6 +1850,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1759,6 +1876,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1835,6 +1955,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1853,6 +1976,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1916,6 +2042,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -1931,6 +2060,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -1946,6 +2078,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_ret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1960,6 +2094,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1996,6 +2134,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -2013,6 +2154,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -2030,6 +2174,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2044,6 +2190,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2083,6 +2233,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -2100,6 +2253,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -2117,6 +2273,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2131,6 +2289,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2170,6 +2332,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2182,6 +2347,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2194,6 +2362,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2205,6 +2375,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2237,6 +2411,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -2251,6 +2428,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -2265,6 +2445,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2276,6 +2458,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2311,6 +2497,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -2325,6 +2514,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -2339,6 +2531,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2350,6 +2544,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2386,7 +2584,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    v_mov_b32_e32 v3, 42
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2394,6 +2594,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -2408,7 +2609,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    v_mov_b32_e32 v3, 42
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2416,6 +2619,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -2430,6 +2634,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2448,6 +2654,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 42
@@ -2504,6 +2714,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2521,6 +2734,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2538,6 +2754,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -2551,6 +2769,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2604,7 +2826,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v4, s3
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v3, s2
 ; CI-NEXT:    flat_store_dword v[3:4], v0
@@ -2623,7 +2848,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v4, s3
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-NEXT:    flat_store_dword v[3:4], v0
@@ -2696,10 +2924,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2717,10 +2948,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2738,7 +2972,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_ret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -2753,6 +2989,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2791,12 +3031,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2814,12 +3057,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2837,7 +3083,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -2852,6 +3100,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2893,12 +3145,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2916,12 +3171,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2939,7 +3197,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -2954,6 +3214,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2995,10 +3259,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3008,10 +3275,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3021,7 +3291,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3033,6 +3305,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3067,12 +3343,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3082,12 +3361,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3097,7 +3379,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3109,6 +3393,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3146,12 +3434,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    v_mov_b32_e32 v1, 0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3161,12 +3452,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3176,7 +3470,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3188,6 +3484,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3226,6 +3526,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3252,6 +3555,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3278,12 +3584,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3297,6 +3605,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3355,6 +3667,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3373,6 +3688,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3391,12 +3709,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_inc_x2 v[3:4], v[1:2] offset:40
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3405,6 +3725,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3455,6 +3779,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
+; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
@@ -3462,6 +3787,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; CI-NEXT:    ds_inc_rtn_u32 v3, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -3475,6 +3802,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
@@ -3482,6 +3810,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; VI-NEXT:    ds_inc_rtn_u32 v3, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 34efb089b72bf1..868b530e42a219 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3037,7 +3037,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT:     user_sgpr_count = 10
+; GPRIDX-NEXT:     user_sgpr_count = 12
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -3052,7 +3052,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
 ; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -3069,7 +3069,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 13
+; GPRIDX-NEXT:     wavefront_sgpr_count = 15
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -3117,7 +3117,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -3128,7 +3128,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT:     user_sgpr_count = 10
+; MOVREL-NEXT:     user_sgpr_count = 12
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -3143,7 +3143,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
 ; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; MOVREL-NEXT:     enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -3160,7 +3160,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 9
+; MOVREL-NEXT:     wavefront_sgpr_count = 22
 ; MOVREL-NEXT:     workitem_vgpr_count = 4
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -3178,21 +3178,24 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; MOVREL-NEXT:    s_load_dword s8, s[6:7], 0x8
+; MOVREL-NEXT:    s_add_i32 s10, s10, s15
+; MOVREL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; MOVREL-NEXT:    s_mov_b32 s4, 0
 ; MOVREL-NEXT:    s_mov_b32 s5, 0x40080000
-; MOVREL-NEXT:    s_mov_b32 s2, 0
-; MOVREL-NEXT:    s_mov_b32 s3, 0x40140000
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 1
 ; MOVREL-NEXT:    s_cselect_b64 s[6:7], 2.0, 1.0
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 2
+; MOVREL-NEXT:    s_mov_b32 s2, 0
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 3
+; MOVREL-NEXT:    s_mov_b32 s3, 0x40140000
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], 4.0, s[4:5]
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 4
 ; MOVREL-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s2
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
+; MOVREL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s3
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
 ; MOVREL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -3220,7 +3223,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT:     user_sgpr_count = 10
+; GFX10-NEXT:     user_sgpr_count = 12
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -3235,7 +3238,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     enable_sgpr_queue_ptr = 0
 ; GFX10-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GFX10-NEXT:     enable_sgpr_dispatch_id = 1
-; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; GFX10-NEXT:     enable_sgpr_private_segment_size = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4054,7 +4057,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT:     user_sgpr_count = 10
+; GPRIDX-NEXT:     user_sgpr_count = 12
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4069,7 +4072,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
 ; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4086,7 +4089,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 12
+; GPRIDX-NEXT:     wavefront_sgpr_count = 14
 ; GPRIDX-NEXT:     workitem_vgpr_count = 2
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4127,7 +4130,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 0
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -4138,7 +4141,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT:     user_sgpr_count = 10
+; MOVREL-NEXT:     user_sgpr_count = 12
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4153,7 +4156,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
 ; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; MOVREL-NEXT:     enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4170,7 +4173,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 8
+; MOVREL-NEXT:     wavefront_sgpr_count = 22
 ; MOVREL-NEXT:     workitem_vgpr_count = 3
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -4188,6 +4191,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; MOVREL-NEXT:    s_add_i32 s10, s10, s15
+; MOVREL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; MOVREL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s2, 1
 ; MOVREL-NEXT:    s_cselect_b32 s3, 2.0, 1.0
@@ -4223,7 +4229,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT:     user_sgpr_count = 10
+; GFX10-NEXT:     user_sgpr_count = 12
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4238,7 +4244,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_sgpr_queue_ptr = 0
 ; GFX10-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GFX10-NEXT:     enable_sgpr_dispatch_id = 1
-; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; GFX10-NEXT:     enable_sgpr_private_segment_size = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4401,7 +4407,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT:     user_sgpr_count = 10
+; GPRIDX-NEXT:     user_sgpr_count = 12
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4416,7 +4422,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
 ; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4433,7 +4439,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 13
+; GPRIDX-NEXT:     wavefront_sgpr_count = 15
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4477,7 +4483,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -4488,7 +4494,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT:     user_sgpr_count = 10
+; MOVREL-NEXT:     user_sgpr_count = 12
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4503,7 +4509,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
 ; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; MOVREL-NEXT:     enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4520,7 +4526,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 9
+; MOVREL-NEXT:     wavefront_sgpr_count = 22
 ; MOVREL-NEXT:     workitem_vgpr_count = 4
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -4538,10 +4544,12 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dword s8, s[6:7], 0x8
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; MOVREL-NEXT:    s_add_i32 s10, s10, s15
+; MOVREL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; MOVREL-NEXT:    s_mov_b32 s2, 0
-; MOVREL-NEXT:    s_mov_b32 s3, 0x40080000
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 1
+; MOVREL-NEXT:    s_mov_b32 s3, 0x40080000
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 2
 ; MOVREL-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
@@ -4549,6 +4557,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:    s_cselect_b64 s[2:3], 4.0, s[2:3]
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s2
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
+; MOVREL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s3
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
 ; MOVREL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -4576,7 +4585,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT:     user_sgpr_count = 10
+; GFX10-NEXT:     user_sgpr_count = 12
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4591,7 +4600,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_sgpr_queue_ptr = 0
 ; GFX10-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GFX10-NEXT:     enable_sgpr_dispatch_id = 1
-; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 1
 ; GFX10-NEXT:     enable_sgpr_private_segment_size = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 80e9ae33d6d45e..70b889389ff998 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -10,11 +10,11 @@
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x40
-; GFX8V4-NEXT:    s_add_i32 s8, s8, s11
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_mov_b32 s4, s0
 ; GFX8V4-NEXT:    s_mov_b32 s5, s3
@@ -38,11 +38,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX8V5-LABEL: addrspacecast:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xc8
-; GFX8V5-NEXT:    s_add_i32 s6, s6, s9
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_mov_b32 s4, s0
 ; GFX8V5-NEXT:    s_mov_b32 s5, s2
@@ -65,9 +65,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
-; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
@@ -91,9 +91,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
-; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
@@ -127,6 +127,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x40
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
@@ -140,6 +143,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xcc
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
@@ -183,6 +189,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x44
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
@@ -196,6 +205,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xc8
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
@@ -279,7 +291,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V4:       ; %bb.0:
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8V4-NEXT:    s_add_u32 s0, s8, 8
 ; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
@@ -305,6 +320,9 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index 4fcde0f2fc7cf1..7aa3b5bb109903 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[20:23], s[6:7], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[24:25], s[6:7], 0x10
-; GCN-NEXT:    s_add_u32 s0, s0, s13
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
index c3938e673a6da6..6ed2df430998f8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -11,13 +11,16 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace(
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 4
 ; CHECK-NEXT:    s_mov_b32 m0, -1
+; CHECK-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-NEXT:    ds_read_b32 v2, v0
-; CHECK-NEXT:    v_mov_b32_e32 v3, 9
+; CHECK-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_add_u32 s0, s0, 4
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-NEXT:    v_mov_b32_e32 v3, 9
 ; CHECK-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x200
 ; CHECK-NEXT:    ds_write_b32 v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
index d7a82b415ff06c..3a90c3ee908033 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
@@ -6,6 +6,9 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0xa
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s2, 0
 ; GCN-NEXT:    s_cselect_b32 s2, 1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
index f3654fea486e0c..96fd14f52d13b0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
@@ -42,6 +42,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -59,6 +62,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -76,6 +82,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -85,6 +93,10 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ;
 ; GFX10-LABEL: s_trig_preop_f64:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    s_load_dword s2, s[6:7], 0x8
@@ -113,6 +125,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ; CI-LABEL: s_trig_preop_f64_imm:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
 ; CI-NEXT:    s_add_u32 s0, s0, 4
@@ -128,6 +143,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ; VI-LABEL: s_trig_preop_f64_imm:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
 ; VI-NEXT:    s_add_u32 s0, s0, 4
@@ -143,6 +161,8 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ; GFX9-LABEL: s_trig_preop_f64_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
 ; GFX9-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
@@ -151,6 +171,10 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ;
 ; GFX10-LABEL: s_trig_preop_f64_imm:
 ; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_add_u32 s10, s10, s15
+; GFX10-NEXT:    s_addc_u32 s11, s11, 0
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 7d7f450e590faa..f0ec0d101f5bec 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -7,6 +7,9 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: sdivrem_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s8, s5, 31
 ; GFX8-NEXT:    s_add_i32 s0, s5, s8
@@ -145,6 +148,9 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
 ; GFX8-LABEL: sdivrem_i64:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
@@ -616,6 +622,9 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: sdivrem_v2i32:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s2, s10, 31
@@ -845,6 +854,9 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
 ; GFX8-LABEL: sdivrem_v4i32:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1271,6 +1283,9 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: sdivrem_v2i64:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x20
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2188,6 +2203,9 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
 ; GFX8-LABEL: sdiv_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x80008
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
@@ -2333,6 +2351,9 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: sdivrem_v2i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s2, 0x80010
 ; GFX8-NEXT:    s_ashr_i32 s3, s0, 31
@@ -2597,6 +2618,9 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
 ; GFX8-LABEL: sdiv_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x100010
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
@@ -2742,6 +2766,9 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-LABEL: sdivrem_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s3
 ; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
@@ -3003,6 +3030,9 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
 ; GFX8-LABEL: sdivrem_i3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x30008
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
@@ -3154,6 +3184,9 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: sdivrem_i27:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s5, 0x1b0000
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 5aef6679347094..2be04ace99e36d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -7,6 +7,9 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: udivrem_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
 ; GFX8-NEXT:    s_sub_i32 s0, 0, s5
@@ -112,6 +115,9 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
 ; GFX8-LABEL: udivrem_i64:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s11
@@ -522,6 +528,9 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: udivrem_v2i32:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s10
@@ -685,6 +694,9 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
 ; GFX8-LABEL: udivrem_v4i32:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -979,7 +991,10 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: udivrem_v2i64:
 ; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
 ; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[6:7], 0x20
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s13
@@ -1772,6 +1787,9 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
 ; GFX8-LABEL: udiv_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x80008
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
@@ -1885,6 +1903,9 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s2, s0, 0x80010
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
@@ -2081,6 +2102,9 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
 ; GFX8-LABEL: udiv_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
@@ -2193,7 +2217,10 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-LABEL: udivrem_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s2, s1, 0xffff
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
@@ -2387,6 +2414,9 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
 ; GFX8-LABEL: udivrem_i3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x30008
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
@@ -2505,6 +2535,9 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: udivrem_i27:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s5, s5, 0x7ffffff
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index e53653408feb40..b8ffa4f14c3e5c 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -135,6 +135,9 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr)
 ; FIXEDABI-LABEL: marked_kernel_use_workitem_id:
 ; FIXEDABI:       ; %bb.0:
 ; FIXEDABI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; FIXEDABI-NEXT:    s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v4, s1
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s0
@@ -181,16 +184,19 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr)
 ; FIXEDABI-LABEL: marked_kernel_use_workgroup_id:
 ; FIXEDABI:       ; %bb.0:
 ; FIXEDABI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s6
+; FIXEDABI-NEXT:    s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s8
 ; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v0, s0
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v1, s1
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s7
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s9
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s8
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s10
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
 ; FIXEDABI-NEXT:    s_endpgm
@@ -238,6 +244,9 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
 ; FIXEDABI:       ; %bb.0:
+; FIXEDABI-NEXT:    s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; FIXEDABI-NEXT:    s_add_u32 s0, s4, 8
 ; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; FIXEDABI-NEXT:    s_addc_u32 s1, s5, 0
@@ -261,7 +270,10 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
 define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 {
 ; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr:
 ; FIXEDABI:       ; %bb.0:
+; FIXEDABI-NEXT:    s_add_i32 s4, s4, s9
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v0, 0
+; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v1, 0
 ; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; FIXEDABI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 4b1484e9bd958e..42494efe27a824 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -44,7 +44,7 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
 ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
 ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
 ; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
 ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 ; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
@@ -226,8 +226,8 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
 ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
 ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
 ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
+; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
+; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
 ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: buffer_store_dword v[[K]], off, s[[[BASELO]]:[[RSRCHI]]], 0
@@ -251,8 +251,8 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
 
 ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
 ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
+; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
+; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
 ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
index 0a461f9ee6c968..e0c69706bad79c 100644
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -8,8 +8,10 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
 ; GCN-LABEL: readfirstlane_uniform:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NEXT:    s_mov_b32 s5, 0
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s0, s0, s4
@@ -18,6 +20,7 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
 ; GCN-NEXT:    s_add_u32 s0, s2, 40
 ; GCN-NEXT:    s_addc_u32 s1, s3, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index bf72cccd912cee..89c5303e0e81d1 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -393,11 +393,14 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
 ; GCN-LABEL: select_add_lhs_const_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x83
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x80
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_bitcmp1_b32 s0, 0
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    flat_store_short v[0:1], v0
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index a6d8c6f41eee59..54a800ecee9f17 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE
 
 ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
-; TRAP-HANDLER-ENABLE:  NumSgprs: 77
-; TRAP-HANDLER-DISABLE: NumSgprs: 92
+; TRAP-HANDLER-ENABLE:  NumSgprs: 83
+; TRAP-HANDLER-DISABLE: NumSgprs: 98
 define amdgpu_kernel void @amdhsa_trap_num_sgprs(
     ptr addrspace(1) %out0, i32 %in0,
     ptr addrspace(1) %out1, i32 %in1,
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index 23294eacbe6cb3..ab2e28e5f5cbb5 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -751,7 +751,7 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @use_dispatch_ptr_ret_type()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index b153069fbee16b..aabda171eecb22 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -483,7 +483,7 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR13]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) [[PTR]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -500,7 +500,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] {
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR13]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[PTR]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index fc13b86566f76c..22cc5af30da662 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -35,9 +35,9 @@ entry:
 attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
 
 ; CHECK-LABEL: {{^}}min_1024_max_1024
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 10
-; CHECK: NumSGPRsForWavesPerEU: 2{{$}}
+; CHECK: NumSGPRsForWavesPerEU: 24{{$}}
 ; CHECK: NumVGPRsForWavesPerEU: 43
 @var = addrspace(1) global float 0.0
 define amdgpu_kernel void @min_1024_max_1024() #3 {
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index ed045107d354dc..3ddf8be052e4ab 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -4,8 +4,8 @@
 
 ; ALL-LABEL: {{^}}max_10_sgprs:
 
-; ALL: SGPRBlocks: 1
-; ALL: NumSGPRsForWavesPerEU: 10
+; ALL: SGPRBlocks: 2
+; ALL: NumSGPRsForWavesPerEU: 22
 define amdgpu_kernel void @max_10_sgprs() #0 {
   %one = load volatile i32, ptr addrspace(4) undef
   %two = load volatile i32, ptr addrspace(4) undef
@@ -125,7 +125,7 @@ declare i64 @llvm.amdgcn.dispatch.id() #1
 declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1
 declare ptr addrspace(4) @llvm.amdgcn.queue.ptr() #1
 
-attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
+attributes #0 = { nounwind "amdgpu-num-sgpr"="18" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
 attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 14519f5a5e77c0..26dc3e13a72cf0 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"}
 
 ; Exactly 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}exactly_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 3
 ; CHECK: VGPRBlocks: 5
-; CHECK: NumSGPRsForWavesPerEU: 20
+; CHECK: NumSGPRsForWavesPerEU: 26
 ; CHECK: NumVGPRsForWavesPerEU: 24
 define amdgpu_kernel void @exactly_10() #9 {
   %val0 = load volatile float, ptr addrspace(1) @var
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
index ce5a3eedb5ebb1..f9f48aacfc2bf7 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -1,5 +1,5 @@
 ; Test the generation of the attribute amdgpu-no-flat-scratch-init
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator | FileCheck -check-prefixes=GFX10 %s
 
 ;; tests of alloca
 
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index c0d700cc37464b..6d9da9281211cb 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -1,6 +1,6 @@
 ; Test the generation of the attribute amdgpu-no-flat-scratch-init
-; RUN: llc -march=amdgcn -mcpu=gfx900 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1010 -stop-after=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
 
 ;; tests of alloca
 
@@ -200,10 +200,10 @@ define amdgpu_kernel void @call_with_alloca_cc_kernel() {
 
 define void @call_both_with_and_without_alloca() {
 ; GFX9-LABEL: define void @call_both_with_and_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_both_with_and_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
   call void @with_alloca()
   call void @without_alloca()
   ret void
@@ -211,10 +211,10 @@ define void @call_both_with_and_without_alloca() {
 
 define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
   call void @with_alloca()
   call void @without_alloca()
   ret void
@@ -235,7 +235,7 @@ define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
 ; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
   call void @call_without_alloca()
   ret void
 }
@@ -262,10 +262,10 @@ define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
 
 define void @with_alloca_call_without_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @without_alloca()
@@ -274,10 +274,10 @@ define void @with_alloca_call_without_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @without_alloca()
@@ -310,10 +310,10 @@ define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
 
 define void @with_alloca_call_call_without_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_without_alloca()
@@ -322,10 +322,10 @@ define void @with_alloca_call_call_without_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_without_alloca()
@@ -360,30 +360,30 @@ define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
 
 define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -391,10 +391,10 @@ define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 
 define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -402,30 +402,30 @@ define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrs
 
 define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
   store volatile i32 0, ptr addrspace(2) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   store volatile i32 0, ptr addrspace(2) %ptr
   ret void
 }
 
 define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   %stof = addrspacecast ptr addrspace(2) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -433,10 +433,10 @@ define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 
 define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   %stof = addrspacecast ptr addrspace(2) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -444,30 +444,30 @@ define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrs
 
 define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
   store volatile i32 0, ptr addrspace(3) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   store volatile i32 0, ptr addrspace(3) %ptr
   ret void
 }
 
 define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -475,10 +475,10 @@ define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 
 define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -486,20 +486,20 @@ define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrsp
 
 define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI5:[0-9]+]]
   store volatile i32 0, ptr addrspace(4) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI6:[0-9]+]]
   store volatile i32 0, ptr addrspace(4) %ptr
   ret void
 }
@@ -528,30 +528,30 @@ define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr add
 
 define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
   store volatile i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   store volatile i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -559,10 +559,10 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -570,50 +570,50 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr
 
 define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -621,10 +621,10 @@ define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrsp
 
 define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -632,70 +632,78 @@ define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacec
 
 define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
   call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
+
+
+
+
+
+
+
+
 define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -704,10 +712,10 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(
 
 define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -716,10 +724,10 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_
 
 define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -728,10 +736,10 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5)
 
 define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -764,10 +772,10 @@ define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kern
 
 define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
   call void @without_alloca(i1 true)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -775,10 +783,10 @@ define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
   call void @without_alloca(i1 true)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -834,10 +842,10 @@ declare i32 @llvm.amdgcn.workgroup.id.x()
 
 define void @use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI7:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI7:[0-9]+]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %val, ptr addrspace(1) undef
   ret void
@@ -856,59 +864,84 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 
 define void @call_use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI7:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI7:[0-9]+]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-;GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI3]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI4]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI5]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI6]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI5]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI6]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
+; GFX9:  attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 ; GFX9:  attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI7]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI3]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_IND_CALL]]  = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI4]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI5]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI6]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI5]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI6]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI7]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 810a227152d2a9..6af45035d394f8 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -33,8 +33,8 @@ define void @indirect_use_vcc() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_2level_use_vcc_kernel:
-; CI: ; NumSgprs: 36
-; VI-NOBUG: ; NumSgprs: 36
+; CI: ; NumSgprs: 38
+; VI-NOBUG: ; NumSgprs: 40
 ; VI-BUG: ; NumSgprs: 96
 ; GCN: ; NumVgprs: 41
 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 {
@@ -121,8 +121,8 @@ define void @indirect_use_80_sgpr() #1 {
 }
 
 ; GCN-LABEL: {{^}}indirect_2_level_use_80_sgpr:
-; CI: ; NumSgprs: 82
-; VI-NOBUG: ; NumSgprs: 82
+; CI: ; NumSgprs: 84
+; VI-NOBUG: ; NumSgprs: 86
 ; VI-BUG: ; NumSgprs: 96
 define amdgpu_kernel void @indirect_2_level_use_80_sgpr() #0 {
   call void @indirect_use_80_sgpr()
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 3035a8579c8a6d..48ad2fe687804c 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -12,13 +12,13 @@
 ; OSABI-AMDHSA-ASM: .section .rodata,"a"
 ; OSABI-AMDHSA-ASM: .p2align 6
 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd
-; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 10
+; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 12
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 8
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 1
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
 ; OSABI-AMDHSA-ASM: .text
 
@@ -31,13 +31,13 @@
 ; OSABI-AMDHSA-ASM: .section .rodata,"a"
 ; OSABI-AMDHSA-ASM: .p2align 6
 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub
-; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 10
+; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 12
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 8
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 1
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
 ; OSABI-AMDHSA-ASM: .text
 
diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index 5fbcd0bf669995..c97d3338006022 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -5,6 +5,9 @@
 define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
 ; CHECK-LABEL: _Z11test_kernelPii:
 ; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 3
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index 297fe7618672e6..f586f6d16e0efd 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -6,6 +6,8 @@
 define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) {
 ; CHECK-LABEL: eggs:
 ; CHECK:       ; %bb.0: ; %bb
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x8
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
index 85ed2914b8c7f5..39328d706ba260 100644
--- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dword s14, s[6:7], 0x4
-; CHECK-NEXT:    s_add_u32 s24, s24, s13
+; CHECK-NEXT:    s_add_u32 s24, s24, s15
 ; CHECK-NEXT:    s_addc_u32 s25, s25, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_bitcmp1_b32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index 54fb1dc5c05274..05a245cd3443c4 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -94,6 +94,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX7-LABEL: s_add_co_br_user:
 ; GFX7:       ; %bb.0: ; %bb
 ; GFX7-NEXT:    s_load_dword s2, s[6:7], 0x0
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_add_i32 s0, s2, s2
 ; GFX7-NEXT:    s_cmp_lt_u32 s0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 21799ab79b8396..7fde702bd49ac8 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -13,6 +13,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -25,6 +28,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -68,6 +74,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -80,6 +89,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -122,6 +134,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -134,6 +149,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -175,6 +193,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; CI-LABEL: s_fabs_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
@@ -188,6 +209,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; VI-LABEL: s_fabs_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
@@ -232,6 +256,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
 ; CI-LABEL: fabs_fold_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -249,6 +276,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
@@ -295,6 +325,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -309,6 +342,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -358,6 +394,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -370,6 +409,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -415,6 +457,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -441,6 +486,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -499,9 +547,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v0, v[0:1]
 ; CI-NEXT:    s_lshr_b32 s2, s4, 16
@@ -527,9 +578,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -589,6 +643,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -612,6 +669,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -682,6 +742,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -700,6 +763,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 7252c69cb1cf75..4df6b8d0669158 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -74,6 +74,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
+; GFX7-ALIGNED-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-ALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-ALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -90,6 +93,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
+; GFX7-UNALIGNED-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-UNALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-UNALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -220,8 +226,10 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
+; GFX7-ALIGNED-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-ALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-ALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 1
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s2, s0, 2
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -231,6 +239,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v2
 ; GFX7-ALIGNED-NEXT:    s_addc_u32 s5, s1, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s4
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s0, s0, 3
 ; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v3
@@ -247,6 +256,9 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
+; GFX7-UNALIGNED-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-UNALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-UNALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -359,6 +371,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align4:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
+; GFX7-ALIGNED-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-ALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-ALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -369,6 +384,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
+; GFX7-UNALIGNED-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-UNALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-UNALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index f0ce96af90649d..6482749bd2fb7e 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -24,6 +24,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
 ; GFX678-LABEL: v_test_canonicalize_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -80,6 +83,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
@@ -91,6 +97,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -140,6 +149,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -196,6 +208,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -253,6 +268,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -309,6 +327,9 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
 ; GFX678-LABEL: test_fold_canonicalize_undef_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -352,6 +373,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -395,6 +419,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -441,6 +468,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -485,6 +515,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, -1.0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -529,6 +562,9 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
 ; GFX678-LABEL: test_fold_canonicalize_literal_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x41800000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -573,6 +609,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -616,10 +655,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
+; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX678-NEXT:    flat_store_dword v[0:1], v2
 ; GFX678-NEXT:    s_endpgm
@@ -664,10 +706,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
+; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX678-NEXT:    flat_store_dword v[0:1], v2
 ; GFX678-NEXT:    s_endpgm
@@ -712,10 +757,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
+; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX678-NEXT:    flat_store_dword v[0:1], v2
 ; GFX678-NEXT:    s_endpgm
@@ -760,6 +808,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -804,6 +855,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -850,6 +904,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x807fffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -894,6 +951,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out
 ; GFX678-LABEL: test_fold_canonicalize_qnan_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -938,6 +998,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -982,6 +1045,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1026,6 +1092,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1070,6 +1139,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1114,6 +1186,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1158,6 +1233,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1202,6 +1280,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1
 ; GFX678-LABEL: v_test_canonicalize_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1257,6 +1338,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
 ; GFX6-LABEL: s_test_canonicalize_var_f64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_max_f64 v[2:3], s[2:3], s[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
@@ -1267,6 +1351,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
 ; GFX8-LABEL: s_test_canonicalize_var_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f64 v[0:1], s[2:3], s[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
@@ -1313,6 +1400,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1369,6 +1459,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1
 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1426,6 +1519,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1482,10 +1578,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, v0
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1531,10 +1630,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1578,10 +1680,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1623,10 +1728,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0xbff00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0xbff00000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1668,10 +1776,13 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
 ; GFX678-LABEL: test_fold_canonicalize_literal_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x40300000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x40300000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1713,10 +1824,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, v0
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1762,10 +1876,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, -1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0xfffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0xfffff
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1810,10 +1927,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1857,10 +1977,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, -1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x800fffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x800fffff
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1905,10 +2028,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
 ; GFX678-LABEL: test_fold_canonicalize_qnan_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1950,10 +2076,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1995,10 +2124,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2040,10 +2172,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2085,10 +2220,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2130,10 +2268,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2175,10 +2316,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX678-NEXT:    s_add_i32 s10, s10, s15
+; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2221,6 +2365,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2238,6 +2385,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2305,6 +2455,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2322,6 +2475,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2389,6 +2545,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2407,6 +2566,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2475,6 +2637,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2498,6 +2663,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2568,6 +2736,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2585,6 +2756,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2652,6 +2826,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2669,6 +2846,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2737,6 +2917,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2755,6 +2938,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2824,6 +3010,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2847,6 +3036,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2916,6 +3108,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX6-NEXT:    s_add_i32 s10, s10, s15
+; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -2933,6 +3128,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index fee6540f43c64c..ab00b132f2fd1a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -6,18 +6,13 @@
 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga | FileCheck -check-prefix=NOHSA-NOADDR64 -check-prefix=ALL %s
 
 
-; There are no stack objects even though flat is used by default, so
-; flat_scratch_init should be disabled.
-
 ; ALL-LABEL: {{^}}test:
 
-; ALL-NOT: flat_scr
-
 ; HSA-DEFAULT: flat_store_dword
 ; HSA-NODEFAULT: buffer_store_dword
 ; HSA-NOADDR64: flat_store_dword
 
-; HSA: .amdhsa_user_sgpr_flat_scratch_init 0
+; HSA: .amdhsa_user_sgpr_flat_scratch_init 1
 
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index e4ffedd686ac93..134b7902380862 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefix=CI -check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=FIJI-NOXNACK -check-prefix=GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s
@@ -8,16 +8,16 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX9-ARCH-FLAT-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX9-ARCH-FLAT-XNACK,GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX10-ARCH-FLAT-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX10-ARCH-FLAT-XNACK,GCN %s
 
 ; GCN-LABEL: {{^}}no_vcc_no_flat:
 
@@ -28,8 +28,12 @@
 ; CI: ; NumSgprs: 8
 ; VI-NOXNACK: ; NumSgprs: 8
 ; VI-XNACK: ; NumSgprs: 12
-; GFX9-ARCH-FLAT: ; NumSgprs: 14
-; GFX10-ARCH-FLAT: ; NumSgprs: 8
+; HSA-VI-NOXNACK: ; NumSgprs: 8
+; HSA-VI-XNACK: ; NumSgprs: 12
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 8
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 8
 define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7}"()
@@ -45,8 +49,12 @@ entry:
 ; CI: ; NumSgprs: 10
 ; VI-NOXNACK: ; NumSgprs: 10
 ; VI-XNACK: ; NumSgprs: 12
-; GFX9-ARCH-FLAT: ; NumSgprs: 14
-; GFX10-ARCH-FLAT: ; NumSgprs: 10
+; HSA-VI-NOXNACK: ; NumSgprs: 10
+; HSA-VI-XNACK: ; NumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 10
 define amdgpu_kernel void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -60,10 +68,15 @@ entry:
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
 ; CI: ; NumSgprs: 12
+; FIJI-NOXNACK: ; NumSgprs: 14
 ; VI-NOXNACK: ; NumSgprs: 14
 ; VI-XNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT: ; NumSgprs: 14
-; GFX10-ARCH-FLAT: ; NumSgprs: 8
+; HSA-VI-NOXNACK: ; NumSgprs: 24
+; HSA-VI-XNACK: ; NumSgprs: 24
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 8
 define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -79,8 +92,10 @@ entry:
 ; CI: ; NumSgprs: 12
 ; VI-NOXNACK: ; NumSgprs: 14
 ; VI-XNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT: ; NumSgprs: 14
-; GFX10-ARCH-FLAT: ; NumSgprs: 10
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 10
 define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -99,8 +114,10 @@ entry:
 ; CI: NumSgprs: 4
 ; VI-NOXNACK: NumSgprs: 6
 ; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; NumSgprs: 6
-; GFX10-ARCH-FLAT: ; NumSgprs: 0
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0
 define amdgpu_kernel void @use_flat_scr() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -116,8 +133,10 @@ entry:
 ; CI: NumSgprs: 4
 ; VI-NOXNACK: NumSgprs: 6
 ; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; NumSgprs: 6
-; GFX10-ARCH-FLAT: ; NumSgprs: 0
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_lo() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -133,8 +152,10 @@ entry:
 ; CI: NumSgprs: 4
 ; VI-NOXNACK: NumSgprs: 6
 ; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; NumSgprs: 6
-; GFX10-ARCH-FLAT: ; NumSgprs: 0
+; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_hi() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index c60b9858abd836..23b453438f3a0f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -16,6 +16,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
 ; VI-LABEL: multiple_fadd_use_test_f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_add_f32_e64 v0, s3, -1.0
 ; VI-NEXT:    v_add_f32_e64 v1, s2, -1.0
@@ -82,8 +85,11 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x2c
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_add_u32 s2, s0, 4
 ; VI-NEXT:    v_add_f32_e64 v2, s4, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -143,6 +149,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
 ; VI-LABEL: multiple_use_fadd_fmad_f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_add_u32 s4, s0, 4
@@ -200,6 +209,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s6, s4, 4
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
@@ -263,6 +275,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e64 v0, s2, -4.0
 ; VI-NEXT:    v_mul_f32_e32 v2, s2, v0
@@ -313,10 +328,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0xc0c00000
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; VI-NEXT:    v_mul_f32_e32 v2, s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -362,6 +380,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; VI-DENORM:       ; %bb.0:
 ; VI-DENORM-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
+; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-DENORM-NEXT:    v_add_f16_e64 v0, s2, -1.0
@@ -380,6 +401,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; VI-FLUSH:       ; %bb.0:
 ; VI-FLUSH-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
+; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-FLUSH-NEXT:    v_add_f16_e64 v0, s2, -1.0
@@ -498,6 +522,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
 ; VI-DENORM:       ; %bb.0:
 ; VI-DENORM-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
+; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s3
@@ -519,6 +546,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
 ; VI-FLUSH:       ; %bb.0:
 ; VI-FLUSH-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
+; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
@@ -619,6 +649,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
 ; VI-DENORM:       ; %bb.0:
 ; VI-DENORM-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
+; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s3
@@ -640,6 +673,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
 ; VI-FLUSH:       ; %bb.0:
 ; VI-FLUSH-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
+; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s3
@@ -742,6 +778,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; VI-DENORM-NEXT:    s_load_dword s6, s[6:7], 0x8
+; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
+; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s0, s0, 16
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s0
@@ -749,6 +787,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-DENORM-NEXT:    v_fma_f16 v3, |s6|, 2.0, v0
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s2
+; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-DENORM-NEXT:    s_add_u32 s4, s2, 2
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-DENORM-NEXT:    s_addc_u32 s5, s3, 0
@@ -765,6 +804,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; VI-FLUSH-NEXT:    s_load_dword s6, s[6:7], 0x8
+; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
+; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s0, s0, 16
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
@@ -772,6 +813,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-FLUSH-NEXT:    v_mad_f16 v3, |s6|, 2.0, v0
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
+; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-FLUSH-NEXT:    s_add_u32 s4, s2, 2
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-FLUSH-NEXT:    s_addc_u32 s5, s3, 0
@@ -875,6 +917,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e64 v0, s2, -4.0
 ; VI-NEXT:    v_mul_f16_e32 v2, s2, v0
@@ -928,10 +973,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0xc600
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e32 v0, s2, v0
 ; VI-NEXT:    v_mul_f16_e32 v2, s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 8267bb9f5450f8..c4f13749251a4c 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -8,6 +8,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
 ; CI-LABEL: fneg_fabs_fadd_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -25,6 +28,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -71,6 +77,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
 ; CI-LABEL: fneg_fabs_fmul_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s1, s0, 0x7fff
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -89,6 +98,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -139,6 +151,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_bitset1_b32 s2, 15
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -151,6 +166,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitset1_b32 s2, 15
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -195,6 +213,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_bitset1_b32 s2, 15
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -207,6 +228,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitset1_b32 s2, 15
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -249,6 +273,9 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
 ; CIVI-LABEL: v_fneg_fabs_f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -294,6 +321,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s0, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s1
@@ -316,7 +346,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x4000
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
@@ -325,6 +357,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -369,6 +402,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_or_b32 s2, s2, 0x80008000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -381,6 +417,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_or_b32 s2, s2, 0x80008000
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -423,6 +462,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
 ; CIVI-LABEL: fneg_fabs_v4f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_or_b32 s3, s3, 0x80008000
 ; CIVI-NEXT:    s_or_b32 s2, s2, 0x80008000
@@ -468,6 +510,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; CI-LABEL: fold_user_fneg_fabs_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s0, 16
 ; CI-NEXT:    v_cvt_f32_f16_e64 v1, |s1|
@@ -489,7 +534,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 0xc400
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
@@ -497,6 +544,7 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; VI-NEXT:    v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v1, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -538,6 +586,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    s_and_b32 s0, s4, 0x7fff7fff
@@ -555,6 +606,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_and_b32 s0, s4, 0x7fff7fff
@@ -611,6 +665,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    s_bfe_u32 s0, s4, 0xf0010
@@ -635,7 +692,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v5, 0xc400
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_lshr_b32 s1, s4, 16
@@ -644,6 +703,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; VI-NEXT:    s_and_b32 s0, s4, 0x7fff7fff
 ; VI-NEXT:    v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_mul_f16_e64 v5, |s4|, -4.0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; VI-NEXT:    v_mov_b32_e32 v5, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index 63ccaafeda88f4..e9362ec9b5322f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1478,6 +1478,8 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x4
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x6
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bitcmp1_b32 s8, 0
 ; GFX7-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -1489,6 +1491,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX7-NEXT:    s_cselect_b32 s0, s0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, -v0, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 40982347f3ca00..6e22c9c319f698 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -10,6 +10,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -22,6 +25,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -66,6 +72,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -80,6 +89,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -129,6 +141,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -141,6 +156,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -183,6 +201,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
 ; CI-LABEL: v_fneg_fold_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -200,6 +221,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8-LABEL: v_fneg_fold_f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -246,6 +270,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -258,6 +285,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -299,14 +329,17 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 {
 ; CIVI-LABEL: s_fneg_v2f16_nonload:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
 ; CIVI-NEXT:    ;;#ASMSTART
 ; CIVI-NEXT:    ; def s2
 ; CIVI-NEXT:    ;;#ASMEND
 ; CIVI-NEXT:    s_xor_b32 s2, s2, 0x80008000
-; CIVI-NEXT:    v_mov_b32_e32 v2, s2
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
+; CIVI-NEXT:    v_mov_b32_e32 v2, s2
 ; CIVI-NEXT:    flat_store_dword v[0:1], v2
 ; CIVI-NEXT:    s_endpgm
 ;
@@ -349,6 +382,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -363,6 +399,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -412,6 +451,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -424,6 +466,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -466,6 +511,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; CI-LABEL: v_fneg_fold_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -492,6 +540,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; GFX8-LABEL: v_fneg_fold_v2f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -539,6 +590,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; CI-LABEL: v_extract_fneg_fold_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -560,6 +614,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; GFX8-LABEL: v_extract_fneg_fold_v2f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -624,6 +681,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; CIVI-LABEL: v_extract_fneg_no_fold_v2f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 3735c6349fbb31..6207c442f41eeb 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -10,6 +10,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -21,6 +24,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -48,6 +54,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -59,6 +68,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -85,6 +97,9 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
 ; CIVI-LABEL: load_v3f16_arg:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_add_u32 s4, s0, 4
 ; CIVI-NEXT:    s_addc_u32 s5, s1, 0
@@ -120,6 +135,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
 ; CIVI-LABEL: load_v4f16_arg:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v2, s2
@@ -147,6 +165,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v4, s4
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -161,6 +182,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -193,6 +217,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s3, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -206,6 +233,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -239,6 +269,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -250,6 +283,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -279,6 +315,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s3, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -292,6 +331,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -324,6 +366,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ; CI-LABEL: extload_v3f16_to_v3f32_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
@@ -337,6 +382,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ; VI-LABEL: extload_v3f16_to_v3f32_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
@@ -369,6 +417,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; CI-LABEL: extload_v4f16_to_v4f32_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s3, 16
 ; CI-NEXT:    s_lshr_b32 s5, s2, 16
@@ -384,6 +435,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; VI-LABEL: extload_v4f16_to_v4f32_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s3, 16
 ; VI-NEXT:    s_lshr_b32 s5, s2, 16
@@ -421,6 +475,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s6, s1, 16
 ; CI-NEXT:    s_lshr_b32 s7, s0, 16
@@ -449,6 +506,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s6, s1, 16
 ; VI-NEXT:    s_lshr_b32 s7, s0, 16
@@ -507,6 +567,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
 ; CI-LABEL: extload_f16_to_f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -520,6 +583,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
 ; VI-LABEL: extload_f16_to_f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s0, s[6:7], 0x8
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -553,6 +619,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
 ; CI-LABEL: extload_v2f16_to_v2f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s0, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s1
@@ -569,6 +638,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
 ; VI-LABEL: extload_v2f16_to_v2f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s0, s[6:7], 0x8
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s1
@@ -608,6 +680,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ; CI-LABEL: extload_v3f16_to_v3f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
 ; CI-NEXT:    s_lshr_b32 s4, s2, 16
@@ -629,6 +704,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ; VI-LABEL: extload_v3f16_to_v3f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
 ; VI-NEXT:    s_lshr_b32 s4, s2, 16
@@ -676,6 +754,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ; CI-LABEL: extload_v4f16_to_v4f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s3, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
@@ -701,6 +782,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ; VI-LABEL: extload_v4f16_to_v4f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s5, s3, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s3
@@ -756,6 +840,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s6, s3, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s6
@@ -803,6 +890,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s6, s0, 16
 ; VI-NEXT:    s_lshr_b32 s8, s2, 16
@@ -890,6 +980,9 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr
 ; CIVI-LABEL: global_load_store_f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -920,6 +1013,9 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: global_load_store_v2f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -950,6 +1046,9 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add
 ; CIVI-LABEL: global_load_store_v4f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
@@ -980,6 +1079,9 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: global_load_store_v8f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1010,6 +1112,9 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr
 ; CIVI-LABEL: global_extload_f16_to_f32:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1043,6 +1148,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v2f16_to_v2f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1059,6 +1167,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v2f16_to_v2f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1096,6 +1207,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v3f16_to_v3f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1113,6 +1227,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v3f16_to_v3f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1152,6 +1269,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v4f16_to_v4f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1171,6 +1291,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v4f16_to_v4f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1213,6 +1336,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v8f16_to_v8f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1243,6 +1369,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v8f16_to_v8f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1301,6 +1430,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; CI-LABEL: global_extload_v16f16_to_v16f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s4, s2, 16
 ; CI-NEXT:    s_addc_u32 s5, s3, 0
@@ -1359,6 +1491,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; VI-LABEL: global_extload_v16f16_to_v16f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1458,6 +1593,9 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr
 ; CIVI-LABEL: global_extload_f16_to_f64:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1494,6 +1632,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v2f16_to_v2f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1512,6 +1653,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v2f16_to_v2f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1554,6 +1698,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v3f16_to_v3f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1579,6 +1726,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v3f16_to_v3f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1632,6 +1782,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v4f16_to_v4f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1660,6 +1813,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v4f16_to_v4f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1719,6 +1875,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v8f16_to_v8f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1767,6 +1926,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v8f16_to_v8f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1853,6 +2015,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; CI-LABEL: global_extload_v16f16_to_v16f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1948,6 +2113,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; VI-LABEL: global_extload_v16f16_to_v16f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2103,6 +2271,9 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p
 ; CIVI-LABEL: global_truncstore_f32_to_f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2136,6 +2307,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v2f32_to_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2153,6 +2327,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v2f32_to_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2191,6 +2368,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v3f32_to_v3f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2214,6 +2394,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v3f32_to_v3f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2261,6 +2444,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v4f32_to_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2282,6 +2468,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v4f32_to_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2326,6 +2515,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v8f32_to_v8f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2361,6 +2553,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v8f32_to_v8f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2426,6 +2621,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; CI-LABEL: global_truncstore_v16f32_to_v16f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s4, s2, 32
 ; CI-NEXT:    s_addc_u32 s5, s3, 0
@@ -2495,6 +2693,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; VI-LABEL: global_truncstore_v16f32_to_v16f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s4, s2, 32
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
@@ -2607,6 +2808,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
 ; CI-LABEL: fadd_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -2624,6 +2828,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
@@ -2656,6 +2863,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
 ; CI-LABEL: fadd_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
@@ -2677,6 +2887,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
 ; VI-LABEL: fadd_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s3, 16
 ; VI-NEXT:    s_lshr_b32 s5, s2, 16
@@ -2710,6 +2923,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-LABEL: fadd_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2747,6 +2963,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-LABEL: fadd_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2787,6 +3006,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 {
 ; CI-LABEL: fadd_v8f16:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2845,6 +3067,9 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
 ;
 ; VI-LABEL: fadd_v8f16:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2909,6 +3134,9 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr
 ; CIVI-LABEL: test_bitcast_from_half:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2940,6 +3168,9 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
 ; CIVI-LABEL: test_bitcast_to_half:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 8c017fa5ec2636..51b0e2b86cdf39 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,7 +15,10 @@
 ; CHECK:     .max_flat_workgroup_size: 1024
 ; CHECK:     .name:           test
 ; CHECK:     .private_segment_fixed_size: 0
-; CHECK:     .sgpr_count:     10
+; GFX700:     .sgpr_count:     22
+; GFX803:     .sgpr_count:     24
+; GFX900:     .sgpr_count:     10
+; GFX1010:    .sgpr_count:     10
 ; CHECK:     .symbol:         test.kd
 ; CHECK:     .vgpr_count:     {{3|6}}
 ; WAVE64:    .wavefront_size: 64
@@ -48,8 +51,8 @@ entry:
 
 ; CHECK:   .name:       num_spilled_sgprs
 ; GFX700:   .sgpr_spill_count: 10
-; GFX803:   .sgpr_spill_count: 10
-; GFX900:   .sgpr_spill_count: 62
+; GFX803:   .sgpr_spill_count: 0
+; GFX900:   .sgpr_spill_count: 0
 ; GFX1010:  .sgpr_spill_count: 60
 ; CHECK:   .symbol:     num_spilled_sgprs.kd
 define amdgpu_kernel void @num_spilled_sgprs(
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 37476203fbfad6..2c38e201d326f6 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -43,7 +43,7 @@
 ; ELF:   00E0: 6E616D65 A673696D 706C65BB 2E707269
 ; ELF:   00F0: 76617465 5F736567 6D656E74 5F666978
 ; ELF:   0100: 65645F73 697A6500 AB2E7367 70725F63
-; ELF:   0110: 6F756E74 06B12E73 6770725F 7370696C
+; ELF:   0110: 6F756E74 0EB12E73 6770725F 7370696C
 ; ELF:   0120: 6C5F636F 756E7400 A72E7379 6D626F6C
 ; ELF:   0130: A973696D 706C652E 6B64AB2E 76677072
 ; ELF:   0140: 5F636F75 6E7403B1 2E766770 725F7370
@@ -59,7 +59,7 @@
 ; ELF:   01E0: 73696D70 6C655F6E 6F5F6B65 726E6172
 ; ELF:   01F0: 6773BB2E 70726976 6174655F 7365676D
 ; ELF:   0200: 656E745F 66697865 645F7369 7A6500AB
-; ELF:   0210: 2E736770 725F636F 756E7400 B12E7367
+; ELF:   0210: 2E736770 725F636F 756E740C B12E7367
 ; ELF:   0220: 70725F73 70696C6C 5F636F75 6E7400A7
 ; ELF:   0230: 2E73796D 626F6CB5 73696D70 6C655F6E
 ; ELF:   0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index c998a4b19121e1..8f9b223c361d60 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -10,11 +10,11 @@
 define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addrspace(3) %ptr.local) {
 ; GFX8V4-LABEL: addrspacecast:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x40
-; GFX8V4-NEXT:    s_add_i32 s8, s8, s11
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s8, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s9
+; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V4-NEXT:    s_cselect_b32 s3, s3, 0
@@ -36,11 +36,11 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX8V5-LABEL: addrspacecast:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0xc8
-; GFX8V5-NEXT:    s_add_i32 s6, s6, s9
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s7
+; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V5-NEXT:    s_cselect_b32 s2, s2, 0
@@ -62,9 +62,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
-; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
-; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
+; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
@@ -88,9 +88,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ;
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
-; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s6, s9
-; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
@@ -124,6 +124,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x40
 ; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -136,6 +139,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xcc
 ; GFX8V5-NEXT:    s_load_dword s1, s[6:7], 0x4
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -178,6 +184,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x44
 ; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -190,6 +199,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xc8
 ; GFX8V5-NEXT:    s_load_dword s1, s[6:7], 0x4
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -273,7 +285,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V4:       ; %bb.0:
+; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s6
+; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8V4-NEXT:    s_add_u32 s0, s8, 8
 ; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
@@ -298,6 +313,9 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
+; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
+; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll
index f419d89a7f0a44..b283a8fca8a394 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call-set-from-other-function.ll
@@ -68,6 +68,6 @@ if.end:
   ret void
 }
 ;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 4d62d30a38ed34..ff5c71b65f34bd 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,15 +8,15 @@
 define amdgpu_kernel void @s_input_output_i128() {
   ; GFX908-LABEL: name: s_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %11
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %11
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %12
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %12
   ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: s_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %9
-  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %9
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %10
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %10
   ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
@@ -27,15 +27,15 @@ define amdgpu_kernel void @s_input_output_i128() {
 define amdgpu_kernel void @v_input_output_i128() {
   ; GFX908-LABEL: name: v_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %11
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vreg_128 = COPY %11
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %12
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vreg_128 = COPY %12
   ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: v_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %9
-  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %9
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %10
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %10
   ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=v"()
@@ -46,15 +46,15 @@ define amdgpu_kernel void @v_input_output_i128() {
 define amdgpu_kernel void @a_input_output_i128() {
   ; GFX908-LABEL: name: a_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %11
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY %11
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %12
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY %12
   ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: a_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %9
-  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %9
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %10
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %10
   ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = call i128 asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index 3135addec16183..8a5c81fa8aa432 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
 ; VI-LABEL: s_insertelement_v2bf16_0:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -82,6 +85,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
 ; VI-LABEL: s_insertelement_v2bf16_1:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -144,6 +150,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -216,6 +225,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -286,6 +298,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -358,6 +373,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -435,11 +453,14 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1)
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -530,14 +551,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, s4, v0, v4
@@ -609,14 +633,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, v0, s4, v4
@@ -686,14 +713,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
@@ -765,14 +795,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, v1, s4, v4
@@ -848,9 +881,12 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -942,9 +978,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
@@ -1066,9 +1105,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -1245,11 +1287,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -1417,11 +1462,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 647870f0e08979..aafa4a04a00de9 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -20,6 +20,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2i16_0:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -69,6 +72,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -85,6 +91,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -140,6 +149,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -160,6 +172,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -222,6 +237,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -237,6 +255,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -293,6 +314,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -312,6 +336,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -382,6 +409,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -404,6 +434,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -475,6 +508,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2i16_1:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -523,6 +559,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -539,6 +578,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -589,6 +631,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2f16_0:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -636,6 +681,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2f16_1:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CIVI-NEXT:    s_add_i32 s10, s10, s15
+; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -684,6 +732,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -702,6 +753,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -760,9 +814,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -779,9 +836,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v3, v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -838,6 +898,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -856,6 +919,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -913,6 +979,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -931,6 +1000,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -987,6 +1059,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1005,6 +1080,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1061,6 +1139,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1079,6 +1160,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1135,6 +1219,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1153,6 +1240,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1209,6 +1299,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1227,6 +1320,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1283,6 +1379,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1301,6 +1400,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1363,6 +1465,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1382,6 +1487,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1446,9 +1554,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -1467,9 +1578,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v3, v[0:1]
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -1535,11 +1649,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1562,11 +1679,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1637,14 +1757,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, s4, v0, v4
@@ -1656,9 +1779,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1718,14 +1844,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, v0, s4, v4
@@ -1737,9 +1866,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1800,14 +1932,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
@@ -1819,9 +1954,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1881,14 +2019,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, v1, s4, v4
@@ -1900,9 +2041,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1963,14 +2107,17 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
@@ -1982,9 +2129,12 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2050,6 +2200,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    flat_load_dword v4, v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
@@ -2077,6 +2230,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    flat_load_dword v4, v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
@@ -2160,9 +2316,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2186,9 +2345,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2258,9 +2420,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
@@ -2278,9 +2443,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2341,9 +2509,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2361,9 +2532,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2455,9 +2629,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2509,9 +2686,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2656,11 +2836,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -2683,9 +2866,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s3
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, s2, v8
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
@@ -2761,12 +2947,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    v_mov_b32_e32 v12, 0x3020504
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -2774,6 +2962,7 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
 ; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
+; VI-NEXT:    v_mov_b32_e32 v12, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_perm_b32 v3, s4, v3, v12
@@ -2787,11 +2976,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -2928,11 +3120,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -3025,11 +3220,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[7:10], v[2:3]
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll
index ae2beab4422ea4..b49931379b84a5 100644
--- a/llvm/test/CodeGen/AMDGPU/ipra.ll
+++ b/llvm/test/CodeGen/AMDGPU/ipra.ll
@@ -30,7 +30,7 @@ define hidden void @func() #1 {
 ; GCN-NOT: writelane
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8
 
-; GCN: ; NumSgprs: 33
+; GCN: ; NumSgprs: 37
 ; GCN: ; NumVgprs: 9
 define amdgpu_kernel void @kernel_call() #0 {
   %vgpr = load volatile i32, ptr addrspace(1) undef
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index 496a1c652da251..1a32953305bbc3 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -7,7 +7,7 @@ declare void @llvm.trap() #0
 ; DOORBELL-NEXT:     .amdhsa_group_segment_fixed_size 0
 ; DOORBELL-NEXT:     .amdhsa_private_segment_fixed_size 0
 ; DOORBELL-NEXT:     .amdhsa_kernarg_size 8
-; DOORBELL-NEXT:     .amdhsa_user_sgpr_count 12
+; DOORBELL-NEXT:     .amdhsa_user_sgpr_count 14
 ; DOORBELL-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; DOORBELL:      .end_amdhsa_kernel
 
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index aebc8315514fbc..d51ace630f6925 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -113,16 +113,24 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_normal(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_normal_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
-; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
+; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -168,16 +176,24 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_normal(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_overalign_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
-; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
+; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -223,16 +239,24 @@ define amdgpu_kernel void @module_0_kernel_normal_extern_overalign(i32 %idx) {
 define amdgpu_kernel void @module_1_kernel_normal_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
-; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
+; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -278,16 +302,24 @@ define amdgpu_kernel void @module_0_kernel_overalign_extern_overalign(i32 %idx)
 define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[6:7]
-; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[10:11], s[6:7], 0x0
-; CHECK-NEXT:    s_load_dword s12, s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[16:17]
+; CHECK-NEXT:    s_add_u32 s16, s16, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s17, s17, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_load_dwordx2 s[18:19], s[16:17], 0x0
+; CHECK-NEXT:    s_load_dword s15, s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[18:19]
 ; CHECK-NEXT:    s_lshl_b32 s4, s15, 2
@@ -320,7 +352,12 @@ define amdgpu_kernel void @module_1_kernel_overalign_extern_overalign(i32 %idx)
 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -334,8 +371,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 0
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    ds_write_b16 v4, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -348,15 +385,24 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_normal(i32 %id
 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    s_mov_b32 s15, 4
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -383,7 +429,12 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_normal(i32 %id
 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -397,8 +448,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 2
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    ds_write_b16 v4, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -411,15 +462,24 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_normal(i32
 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_normal:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    s_mov_b32 s15, 6
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -446,7 +506,12 @@ define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_normal(i32
 define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_normal_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -460,8 +525,8 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 1
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    ds_write_b16 v4, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -474,15 +539,24 @@ define amdgpu_kernel void @module_0_kernel_normal_indirect_extern_overalign(i32
 define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_normal_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    s_mov_b32 s15, 5
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
@@ -509,7 +583,12 @@ define amdgpu_kernel void @module_1_kernel_normal_indirect_extern_overalign(i32
 define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_0_kernel_overalign_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
+; CHECK-NEXT:    s_mov_b32 s32, 0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; CHECK-NEXT:    s_add_u32 s8, s6, 8
@@ -523,8 +602,8 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 2
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 0
 ; CHECK-NEXT:    s_mov_b32 s15, 3
-; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    ds_write_b16 v0, v1
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
+; CHECK-NEXT:    ds_write_b16 v4, v3
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_endpgm
@@ -537,15 +616,24 @@ define amdgpu_kernel void @module_0_kernel_overalign_indirect_extern_overalign(i
 define amdgpu_kernel void @module_1_kernel_overalign_indirect_extern_overalign(i32 %idx) {
 ; CHECK-LABEL: module_1_kernel_overalign_indirect_extern_overalign:
 ; CHECK:       ; %bb.0:
-; CHECK-NEXT:    s_add_u32 s0, s0, s7
-; CHECK-NEXT:    s_addc_u32 s1, s1, 0
-; CHECK-NEXT:    s_add_u32 s8, s4, 8
-; CHECK-NEXT:    s_addc_u32 s9, s5, 0
-; CHECK-NEXT:    s_getpc_b64 s[4:5]
-; CHECK-NEXT:    s_add_u32 s4, s4, use_module@gotpcrel32@lo+4
-; CHECK-NEXT:    s_addc_u32 s5, s5, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    s_add_u32 s10, s10, s15
 ; CHECK-NEXT:    s_mov_b32 s32, 0
-; CHECK-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; CHECK-NEXT:    s_addc_u32 s11, s11, 0
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
+; CHECK-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
+; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_addc_u32 s1, s1, 0
+; CHECK-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; CHECK-NEXT:    s_add_u32 s8, s6, 8
+; CHECK-NEXT:    s_addc_u32 s9, s7, 0
+; CHECK-NEXT:    s_getpc_b64 s[6:7]
+; CHECK-NEXT:    s_add_u32 s6, s6, use_module@gotpcrel32@lo+4
+; CHECK-NEXT:    s_addc_u32 s7, s7, use_module@gotpcrel32@hi+12
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; CHECK-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; CHECK-NEXT:    s_mov_b32 s15, 7
+; CHECK-NEXT:    v_or3_b32 v31, v0, v1, v2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; CHECK-NEXT:    s_getpc_b64 s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 8fadfe3d026669..c998a007277933 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -23,8 +23,11 @@ define void @function_lds_id(ptr addrspace(1) %out) {
 define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
 ; GCN-LABEL: kernel_lds_id:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s2, s10, 42
+; GCN-NEXT:    s_add_i32 s2, s12, 42
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
@@ -42,19 +45,28 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 ; GCN-LABEL: indirect_lds_id:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GCN-NEXT:    s_getpc_b64 s[8:9]
-; GCN-NEXT:    s_add_u32 s8, s8, function_lds_id@gotpcrel32@lo+4
-; GCN-NEXT:    s_addc_u32 s9, s9, function_lds_id@gotpcrel32@hi+12
-; GCN-NEXT:    s_load_dwordx2 s[8:9], s[8:9], 0x0
+; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
+; GCN-NEXT:    s_load_dwordx2 s[16:17], s[6:7], 0x0
+; GCN-NEXT:    s_add_u32 s8, s6, 8
+; GCN-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GCN-NEXT:    s_addc_u32 s9, s7, 0
+; GCN-NEXT:    s_getpc_b64 s[6:7]
+; GCN-NEXT:    s_add_u32 s6, s6, function_lds_id@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s7, s7, function_lds_id@gotpcrel32@hi+12
+; GCN-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GCN-NEXT:    v_lshlrev_b32_e32 v1, 10, v1
+; GCN-NEXT:    v_or_b32_e32 v0, v0, v1
+; GCN-NEXT:    v_or_b32_e32 v31, v0, v2
 ; GCN-NEXT:    s_mov_b32 s15, 21
-; GCN-NEXT:    s_mov_b32 s12, s6
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
+; GCN-NEXT:    v_mov_b32_e32 v0, s16
+; GCN-NEXT:    v_mov_b32_e32 v1, s17
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT:    s_endpgm
   call void @function_lds_id(ptr addrspace(1) %out)
   ret void
@@ -63,6 +75,9 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
 ; GCN-LABEL: doesnt_use_it:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x64
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index b061d53de5d3c5..d464269497b471 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -152,6 +152,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
 ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -162,6 +165,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -177,10 +183,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
 ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 32
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -189,10 +198,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -205,10 +217,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
 ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -217,11 +232,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -234,12 +252,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
 ; CHECK-SDAG-LABEL: test_readfirstlane_m0:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 m0, -1
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -247,12 +268,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
 ; CHECK-GISEL-LABEL: test_readfirstlane_m0:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 m0, -1
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -266,25 +290,31 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1
 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 s2, 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -298,13 +328,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -312,13 +345,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -332,13 +368,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -346,13 +385,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 24a332fa211c15..ffceac2c912bb6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -179,6 +179,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -189,6 +192,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -204,10 +210,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 32
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -216,10 +225,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -232,10 +244,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -244,11 +259,14 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -262,6 +280,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -281,6 +302,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -311,6 +335,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -332,6 +359,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -365,6 +395,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -386,6 +419,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -419,12 +455,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
 ; CHECK-SDAG-LABEL: test_readlane_m0_sreg:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 m0, -1
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -432,12 +471,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
 ; CHECK-GISEL-LABEL: test_readlane_m0_sreg:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 m0, -1
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -454,11 +496,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; def v0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v0, 32
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
@@ -468,10 +513,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    ; def v0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -485,14 +533,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; def v[0:1]
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v1, 32
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s3, v0, 32
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -505,10 +556,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
 ; CHECK-GISEL-NEXT:    ; def v[0:1]
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s3, v1, 32
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -523,14 +577,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; def v[0:1]
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v1, 32
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s3, v0, 32
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -543,10 +600,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
 ; CHECK-GISEL-NEXT:    ; def v[0:1]
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s3, v1, 32
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -561,25 +621,31 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou
 ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 s2, 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -593,13 +659,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
 ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -607,13 +676,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -627,13 +699,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
 ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -641,13 +716,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 9d93ca65683c42..da40a06c306b98 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -15,6 +15,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX802-SDAG-LABEL: test_writelane_sreg_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -55,6 +58,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -102,6 +108,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s6, s[6:7], 0x10
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
@@ -153,6 +162,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s6, s[6:7], 0x10
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
@@ -210,6 +222,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s6, s[6:7], 0x10
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
@@ -261,6 +276,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s6, s[6:7], 0x10
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
@@ -318,6 +336,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -362,6 +383,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -412,6 +436,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x8
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
@@ -462,6 +489,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x8
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -518,11 +548,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x8
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
@@ -573,11 +606,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x8
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
@@ -633,6 +669,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -694,6 +733,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -766,6 +808,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -833,6 +878,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -909,7 +957,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX802-SDAG-NEXT:    s_mov_b32 s4, 0x40280000
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -918,6 +968,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX802-SDAG-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT:    s_mov_b32 s4, 0x40280000
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
@@ -980,7 +1031,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
-; GFX802-GISEL-NEXT:    s_mov_b32 s4, 0x40280000
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -990,6 +1043,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX802-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT:    s_mov_b32 s4, 0x40280000
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1064,15 +1118,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX802-SDAG-NEXT:    ;;#ASMSTART
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, -1
 ; GFX802-SDAG-NEXT:    ;;#ASMEND
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    s_mov_b32 s4, m0
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s2
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
@@ -1119,15 +1176,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX802-GISEL-NEXT:    ;;#ASMSTART
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, -1
 ; GFX802-GISEL-NEXT:    ;;#ASMEND
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    s_mov_b32 s4, m0
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v2, s4, m0
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
@@ -1178,6 +1238,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -1222,6 +1285,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s2, s[6:7], 0x8
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1271,6 +1337,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX802-SDAG-LABEL: test_writelane_imm_i64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
@@ -1316,6 +1385,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX802-GISEL-LABEL: test_writelane_imm_i64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1367,6 +1439,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX802-SDAG-LABEL: test_writelane_imm_f64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
@@ -1412,6 +1487,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX802-GISEL-LABEL: test_writelane_imm_f64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1464,6 +1542,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
@@ -1503,6 +1584,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
@@ -1548,10 +1632,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
@@ -1596,11 +1683,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s8
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1649,10 +1739,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
@@ -1697,11 +1790,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s8
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1748,7 +1844,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, 42
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -1782,7 +1881,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, 42
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1822,11 +1924,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, 42
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
@@ -1867,11 +1972,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1917,11 +2025,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x10
+; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
@@ -1962,11 +2073,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 7202ab8b314669..59cc6dfac12005 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
 ; GFX7-HSA-LABEL: constant_load_f64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -93,6 +96,9 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
 ; GFX7-HSA-LABEL: constant_load_2v4f64:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[20:21], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 22b718935738bd..8d833c40a44ee3 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -26,6 +26,9 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: constant_load_i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -108,6 +111,9 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v2i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -181,6 +187,9 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v3i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 4
@@ -287,6 +296,9 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v4i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -363,6 +375,9 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v8i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -449,6 +464,9 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GCN-HSA-LABEL: constant_load_v16i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -591,6 +609,9 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GCN-HSA-LABEL: constant_load_v16i16_align2:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
@@ -804,6 +825,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_zextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -881,6 +905,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_sextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -959,6 +986,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1036,6 +1066,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1112,6 +1145,9 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1199,6 +1235,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1290,6 +1329,9 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1389,6 +1431,9 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1493,6 +1538,9 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1601,6 +1649,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1720,6 +1771,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1880,6 +1934,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2059,6 +2116,9 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2323,6 +2383,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2631,6 +2694,9 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3111,6 +3177,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3681,6 +3750,9 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4604,6 +4676,9 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5392,6 +5467,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_zextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5480,6 +5558,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_sextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5567,6 +5648,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5650,6 +5734,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5736,12 +5823,15 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s0, s2, 16
 ; GCN-HSA-NEXT:    s_and_b32 s1, s2, 0xffff
@@ -5832,6 +5922,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -5937,10 +6030,13 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s4, s3, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s5, s2, 16
@@ -6073,6 +6169,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6231,10 +6330,13 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s8, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s2, s7, 16
@@ -6447,6 +6549,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6710,10 +6815,13 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s12, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s13, s7, 16
@@ -7090,6 +7198,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7568,10 +7679,13 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s19, s1, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s20, s3, 16
@@ -8281,6 +8395,9 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index f1a6bccc559f04..cf0cecde326e1a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -23,6 +23,9 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
 ; GFX7-HSA-LABEL: constant_load_i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -105,6 +108,9 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v2i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -194,6 +200,9 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v3i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -290,6 +299,9 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v4i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -390,6 +402,9 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX7-HSA-LABEL: constant_load_v8i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -526,6 +541,9 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX7-HSA-LABEL: constant_load_v9i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s12, s[10:11], 0x8
@@ -689,6 +707,9 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX7-HSA-LABEL: constant_load_v10i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[12:13], s[10:11], 0x8
@@ -861,6 +882,9 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-LABEL: constant_load_v11i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[12:15], s[2:3], 0x8
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
@@ -1038,6 +1062,9 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX7-HSA-LABEL: constant_load_v12i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[12:15], s[10:11], 0x8
@@ -1221,6 +1248,9 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-LABEL: constant_load_v16i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s18, s16, 48
@@ -1409,6 +1439,9 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX7-HSA-LABEL: constant_zextload_i32_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1495,6 +1528,9 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX7-HSA-LABEL: constant_sextload_i32_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1587,6 +1623,9 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1673,6 +1712,9 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1767,12 +1809,15 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s3
@@ -1867,6 +1912,9 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1983,13 +2031,16 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GFX7-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s3
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s6
@@ -2118,6 +2169,9 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2282,8 +2336,10 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s2, s0, 48
@@ -2291,6 +2347,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX7-HSA-NEXT:    s_add_u32 s2, s0, 32
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s11
@@ -2491,6 +2548,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -2789,6 +2849,9 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3245,13 +3308,16 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s18, s16, 0x70
 ; GFX7-HSA-NEXT:    s_addc_u32 s19, s17, 0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s18
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s19
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s14
@@ -3681,6 +3747,9 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x10
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4538,14 +4607,17 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[36:39], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s34, s36, 0xf0
 ; GFX7-HSA-NEXT:    s_addc_u32 s35, s37, 0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s34
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s35
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s30
@@ -5159,6 +5231,9 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-LABEL: constant_load_v32i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[36:39], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 46c7c2b08cd64b..45f0af8d423b67 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
 ; GFX7-LABEL: constant_load_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
@@ -97,6 +100,9 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-LABEL: constant_load_v2i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s0
@@ -183,6 +189,9 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-LABEL: constant_load_v3i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x4
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
@@ -299,6 +308,9 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX7-LABEL: constant_load_v4i64:
 ; GFX7:       ; %bb.0: ; %entry
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -430,6 +442,9 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-LABEL: constant_load_v8i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-NEXT:    s_add_u32 s18, s16, 48
@@ -649,6 +664,9 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
 ; GFX7-LABEL: constant_load_v16i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[36:39], s[6:7], 0x0
+; GFX7-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 67a376b8c0f3c5..22638d621932e1 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -27,6 +27,9 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
 ; GFX7-HSA-LABEL: constant_load_i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -114,6 +117,9 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v2i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -199,6 +205,9 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v3i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -311,6 +320,9 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v4i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -382,6 +394,9 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v8i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -458,6 +473,9 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v16i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -541,6 +559,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_zextload_i8_to_i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -618,6 +639,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_sextload_i8_to_i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -696,6 +720,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -773,6 +800,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -854,6 +884,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -954,6 +987,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1054,6 +1090,9 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1157,6 +1196,9 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1262,6 +1304,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1368,6 +1413,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1487,6 +1535,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1646,6 +1697,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1832,6 +1886,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2096,6 +2153,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2416,6 +2476,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2892,6 +2955,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3484,6 +3550,9 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4391,6 +4460,9 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5214,6 +5286,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_zextload_i8_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5298,6 +5373,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_sextload_i8_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5385,6 +5463,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5467,6 +5548,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5557,6 +5641,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5665,6 +5752,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5780,10 +5870,13 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_bfe_u32 s4, s2, 0x80008
 ; GFX7-HSA-NEXT:    s_lshr_b32 s3, s2, 24
@@ -5921,6 +6014,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6082,10 +6178,13 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_lshr_b32 s4, s2, 24
 ; GFX7-HSA-NEXT:    s_lshr_b32 s5, s3, 24
@@ -6306,6 +6405,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6578,10 +6680,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_lshr_b32 s8, s5, 24
 ; GFX7-HSA-NEXT:    s_lshr_b32 s9, s4, 24
@@ -6968,6 +7073,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7455,10 +7563,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_lshr_b32 s12, s4, 24
 ; GFX7-HSA-NEXT:    s_lshr_b32 s13, s5, 24
@@ -8198,6 +8309,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -8950,6 +9064,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_zextload_i8_to_i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9036,6 +9153,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_sextload_i8_to_i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9124,6 +9244,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9210,6 +9333,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9301,6 +9427,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9402,6 +9531,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9513,6 +9645,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -9651,6 +9786,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -9807,6 +9945,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -10012,6 +10153,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -10255,6 +10399,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10618,6 +10765,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11054,6 +11204,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ;
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16:
 ; GFX7-HSA:       ; %bb.0:
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -11729,6 +11882,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16:
 ; GFX7-HSA:       ; %bb.0:
+; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 4cc47b09d813d6..065e92d545cc96 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -28,6 +28,9 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(
 ; GCN-HSA-LABEL: global_load_i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -133,6 +136,9 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v2i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -219,6 +225,9 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v3i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -350,6 +359,9 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v4i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -435,6 +447,9 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v8i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -523,6 +538,9 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
 ; GCN-HSA-LABEL: global_load_v16i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
@@ -673,6 +691,9 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
 ; GCN-HSA-LABEL: global_load_v16i16_align2:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
@@ -822,6 +843,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_zextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -907,6 +931,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_sextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -995,6 +1022,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1080,6 +1110,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1170,6 +1203,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1269,6 +1305,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1370,6 +1409,9 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1480,6 +1522,9 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1597,6 +1642,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1712,6 +1760,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1834,6 +1885,9 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1983,6 +2037,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2147,6 +2204,9 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -2383,6 +2443,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2654,6 +2717,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -3065,6 +3131,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -3584,6 +3653,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -4387,6 +4459,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5151,6 +5226,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_zextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5248,6 +5326,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_sextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5343,6 +5424,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5435,6 +5519,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5533,6 +5620,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5642,6 +5732,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5760,6 +5853,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5905,6 +6001,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6065,10 +6164,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v4
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6083,8 +6182,11 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v4
@@ -6284,6 +6386,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6534,10 +6639,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6554,7 +6659,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x50
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v8
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v8
@@ -6914,6 +7022,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -7385,6 +7496,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -8087,6 +8201,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 7cdf270810dea0..88b67e2427fbeb 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -27,6 +27,9 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(
 ; GCNX3-HSA-LABEL: global_load_i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -106,6 +109,9 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v2i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -186,6 +192,9 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v3i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -270,6 +279,9 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v4i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -352,6 +364,9 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v8i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
@@ -458,6 +473,9 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v9i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -589,6 +607,9 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v10i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -719,6 +740,9 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v11i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -854,6 +878,9 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v12i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -987,6 +1014,9 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v16i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -1134,6 +1164,9 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr
 ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1217,6 +1250,9 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr
 ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1301,6 +1337,9 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1384,6 +1423,9 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1471,6 +1513,9 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1568,6 +1613,9 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1670,8 +1718,10 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, 0
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v5
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1679,6 +1729,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v5
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v2
@@ -1796,6 +1847,9 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1937,8 +1991,10 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, 0
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v9
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1953,6 +2009,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 48
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v9
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s1
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s0
@@ -2130,6 +2187,9 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2366,6 +2426,9 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2727,8 +2790,10 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, 0
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v17
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -2762,6 +2827,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s1
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v17
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s0
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
@@ -3115,6 +3181,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -3586,7 +3655,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-GFX900-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-GFX900-HSA-NEXT:    s_add_u32 s16, s16, s13
+; GCN-GFX900-HSA-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-GFX900-HSA-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-GFX900-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
@@ -3902,6 +3971,9 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -4426,6 +4498,9 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v32i32:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
+; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 4dfc773d615e4f..1a6fa3c518ca7a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -13,7 +13,8 @@
 ; GCN: s_cselect_b32
 
 ; GCN-NOT: load_dword
-; GCN: flat_load_dwordx2
+; GCN: flat_load_dword
+; GCN: flat_load_dword
 ; GCN-NOT: load_dword
 
 ; GCN: flat_store_dwordx2
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index 05ad567478675c..12df2b0ed9380d 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -161,7 +161,10 @@ define amdgpu_kernel void @k01() {
 ; GCN-LABEL: k01:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -182,6 +185,7 @@ define amdgpu_kernel void @k01() {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GCN-NEXT:    s_endpgm
+
   call void @f0()
   call void @f1()
   ret void
@@ -197,7 +201,10 @@ define amdgpu_kernel void @k23() {
 ; GCN-LABEL: k23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -244,7 +251,10 @@ define amdgpu_kernel void @k123() {
 ; GCN-LABEL: k123:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index 3453ff9d296c09..72a0aceaae12b6 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -226,7 +226,10 @@ define amdgpu_kernel void @k01() {
 ; GCN-LABEL: k01:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -270,7 +273,10 @@ define amdgpu_kernel void @k23() {
 ; GCN-LABEL: k23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
@@ -317,7 +323,10 @@ define amdgpu_kernel void @k123() {
 ; GCN-LABEL: k123:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_mov_b32 s32, 0
-; GCN-NEXT:    s_add_u32 s0, s0, s7
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s0, s0, s15
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    s_mov_b64 s[10:11], s[8:9]
 ; GCN-NEXT:    s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
index e876a8d9dda692..245a2775d9f2fb 100644
--- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
 
 ; GCN-LABEL: {{^}}get_global_id_0:
 ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
-; GCN: s_mul_i32 [[MUL:s[0-9]+]], s10, [[WGSIZEX]]
+; GCN: s_mul_i32 [[MUL:s[0-9]+]], s12, [[WGSIZEX]]
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0
 define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 {
   %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index 92536c2078514a..e8632871f56ea4 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -11,8 +11,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX9-NEXT:    s_mul_i32 s10, s10, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s10
+; GFX9-NEXT:    s_mul_i32 s12, s12, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s12
 ; GFX9-NEXT:    v_add_u32_e32 v0, s5, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 4, v[0:1]
@@ -39,8 +39,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX10-NEXT:    s_mul_i32 s10, s10, s4
-; GFX10-NEXT:    v_add3_u32 v0, s5, s10, v0
+; GFX10-NEXT:    s_mul_i32 s12, s12, s4
+; GFX10-NEXT:    v_add3_u32 v0, s5, s12, v0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 4, v[0:1]
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v4
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 0a76e169e9c385..0348737a41a30f 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -9,6 +9,8 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
 ; CHECK-LABEL: memcpy_p0_p0_minsize:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s3
@@ -186,7 +188,7 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    s_add_u32 s16, s16, s13
+; CHECK-NEXT:    s_add_u32 s16, s16, s15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
 ; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
@@ -566,7 +568,9 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x8
-; CHECK-NEXT:    s_add_u32 s16, s16, s13
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; CHECK-NEXT:    s_add_u32 s16, s16, s15
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
@@ -981,6 +985,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
 ; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
 ; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
@@ -1255,6 +1261,8 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
 ; CHECK-LABEL: memcpy_p0_p0_optsize:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s3
@@ -1432,7 +1440,7 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    s_add_u32 s16, s16, s13
+; CHECK-NEXT:    s_add_u32 s16, s16, s15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
 ; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
@@ -1812,7 +1820,9 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x8
-; CHECK-NEXT:    s_add_u32 s16, s16, s13
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; CHECK-NEXT:    s_add_u32 s16, s16, s15
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
@@ -2227,6 +2237,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
 ; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
 ; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
+; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 686797f290b97f..bbb70d53b0dd4b 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -34,10 +34,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -56,10 +59,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -146,6 +152,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_imin_sle_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -157,6 +166,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_imin_sle_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -218,6 +230,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ; CI-LABEL: s_test_imin_sle_v1i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -229,6 +244,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ; VI-LABEL: s_test_imin_sle_v1i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -292,6 +310,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
 ;
 ; CI-LABEL: s_test_imin_sle_v4i32:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -310,6 +331,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
 ;
 ; VI-LABEL: s_test_imin_sle_v4i32:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -422,11 +446,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i8 s2, s2
 ; CI-NEXT:    s_sext_i32_i8 s3, s3
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_byte v[0:1], v2
@@ -437,11 +464,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i8 s2, s2
 ; VI-NEXT:    s_sext_i32_i8 s3, s3
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_byte v[0:1], v2
@@ -559,6 +589,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_ashr_i32 s4, s2, 24
 ; CI-NEXT:    s_sext_i32_i8 s5, s2
@@ -582,6 +614,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; CI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; CI-NEXT:    s_or_b32 s2, s3, s2
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -592,6 +625,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
 ; VI-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
@@ -612,6 +647,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -753,6 +789,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; CI-LABEL: s_test_imin_sle_v2i16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_ashr_i32 s4, s2, 16
 ; CI-NEXT:    s_sext_i32_i16 s2, s2
@@ -772,6 +811,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; VI-LABEL: s_test_imin_sle_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_ashr_i32 s4, s2, 16
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
@@ -906,6 +948,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_ashr_i32 s6, s0, 16
 ; CI-NEXT:    s_ashr_i32 s7, s1, 16
@@ -936,6 +981,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_ashr_i32 s6, s1, 16
 ; VI-NEXT:    s_sext_i32_i16 s1, s1
@@ -1034,10 +1082,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1056,10 +1107,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1175,10 +1229,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1197,10 +1254,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1288,6 +1348,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_imin_slt_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1299,6 +1362,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_imin_slt_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1362,6 +1428,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s1, s1, s3
 ; CI-NEXT:    s_min_i32 s0, s0, s2
@@ -1376,6 +1445,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s1, s1, s3
 ; VI-NEXT:    s_min_i32 s0, s0, s2
@@ -1450,6 +1522,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1462,6 +1537,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1529,6 +1607,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1541,6 +1622,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1620,10 +1704,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1642,10 +1729,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1751,12 +1841,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v6
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v2, s5
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, s4, v6
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
 ; CI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
 ; CI-NEXT:    flat_load_dwordx3 v[3:5], v[3:4]
@@ -1775,12 +1868,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s5
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v6
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
 ; VI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
 ; VI-NEXT:    flat_load_dwordx3 v[3:5], v[3:4]
@@ -1914,12 +2010,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
@@ -1950,12 +2049,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
@@ -2054,6 +2156,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_umin_ule_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2065,6 +2170,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_umin_ule_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2139,10 +2247,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2161,10 +2272,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2270,6 +2384,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s3
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, s2, v0
@@ -2291,6 +2408,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v0
@@ -2378,6 +2498,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_umin_ult_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2389,6 +2512,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_umin_ult_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2472,6 +2598,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; CI-LABEL: v_test_umin_ult_i32_multi_use:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s5, s[6:7], 0x0
@@ -2493,6 +2622,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; VI-LABEL: v_test_umin_ult_i32_multi_use:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s5, s[6:7], 0x0
@@ -2622,6 +2754,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; CI-LABEL: v_test_umin_ult_i16_multi_use:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s4
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
@@ -2644,6 +2779,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; VI-LABEL: v_test_umin_ult_i16_multi_use:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
@@ -2736,6 +2874,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ; CI-LABEL: s_test_umin_ult_v1i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2747,6 +2888,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ; VI-LABEL: s_test_umin_ult_v1i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2818,6 +2962,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
 ;
 ; CI-LABEL: s_test_umin_ult_v8i32:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x8
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2849,6 +2996,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
 ;
 ; VI-LABEL: s_test_umin_ult_v8i32:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x20
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3109,6 +3259,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ;
 ; CI-LABEL: s_test_umin_ult_v8i16:
 ; CI:       ; %bb.0:
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3155,6 +3308,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ;
 ; VI-LABEL: s_test_umin_ult_v8i16:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3280,11 +3436,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; CI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -3295,11 +3454,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; VI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -3389,11 +3551,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i16 s2, s2
 ; CI-NEXT:    s_sext_i32_i16 s3, s3
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -3404,11 +3569,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
 ; VI-NEXT:    s_sext_i32_i16 s3, s3
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -3505,6 +3673,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i16 s3, s2
 ; CI-NEXT:    s_ashr_i32 s2, s2, 16
@@ -3519,6 +3690,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i16 s3, s2
 ; VI-NEXT:    s_ashr_i32 s2, s2, 16
@@ -3601,6 +3775,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3619,6 +3796,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3711,6 +3891,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3729,6 +3912,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3821,6 +4007,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3839,6 +4028,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3931,6 +4123,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3949,6 +4144,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4065,9 +4263,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
@@ -4096,10 +4297,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -4217,9 +4421,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
@@ -4247,10 +4454,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index 5792fab7011afe..d7814c52828b88 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -180,6 +180,9 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -260,6 +263,9 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -341,6 +347,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -403,6 +412,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -465,6 +477,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -527,6 +542,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -588,6 +606,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT:    s_add_i32 s10, s10, s15
+; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index 529e64715500dd..3de6945f95556b 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -176,6 +176,9 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1)
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -254,6 +257,9 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -333,6 +339,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 {
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -393,6 +402,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -454,6 +466,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -514,6 +529,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT:    s_add_i32 s10, s10, s15
+; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 58b61510c24e8b..176f04fb03506a 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -10,36 +10,36 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; REGALLOC-GFX908: bb.0 (%ir-block.0):
   ; REGALLOC-GFX908-NEXT:   liveins: $sgpr4_sgpr5
   ; REGALLOC-GFX908-NEXT: {{  $}}
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %26
-  ; REGALLOC-GFX908-NEXT:   [[COPY:%[0-9]+]]:av_128 = COPY %26
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %23
-  ; REGALLOC-GFX908-NEXT:   SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %27
+  ; REGALLOC-GFX908-NEXT:   [[COPY:%[0-9]+]]:av_128 = COPY %27
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %24
+  ; REGALLOC-GFX908-NEXT:   SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
   ; REGALLOC-GFX908-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; REGALLOC-GFX908-NEXT:   [[COPY2:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; REGALLOC-GFX908-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX908-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; REGALLOC-GFX908-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
   ; REGALLOC-GFX908-NEXT:   [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef %16:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef %17:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   S_ENDPGM 0
   ;
   ; PEI-GFX908-LABEL: name: partial_copy
   ; PEI-GFX908: bb.0 (%ir-block.0):
-  ; PEI-GFX908-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
+  ; PEI-GFX908-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
   ; PEI-GFX908-NEXT: {{  $}}
-  ; PEI-GFX908-NEXT:   $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
-  ; PEI-GFX908-NEXT:   $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
-  ; PEI-GFX908-NEXT:   $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+  ; PEI-GFX908-NEXT:   $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+  ; PEI-GFX908-NEXT:   $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+  ; PEI-GFX908-NEXT:   $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
   ; PEI-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
   ; PEI-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
   ; PEI-GFX908-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
   ; PEI-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
-  ; PEI-GFX908-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+  ; PEI-GFX908-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
   ; PEI-GFX908-NEXT:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
   ; PEI-GFX908-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
   ; PEI-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
@@ -48,7 +48,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; PEI-GFX908-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
   ; PEI-GFX908-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
   ; PEI-GFX908-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-  ; PEI-GFX908-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+  ; PEI-GFX908-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
   ; PEI-GFX908-NEXT:   $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
   ; PEI-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX908-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -59,34 +59,34 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; REGALLOC-GFX90A: bb.0 (%ir-block.0):
   ; REGALLOC-GFX90A-NEXT:   liveins: $sgpr4_sgpr5
   ; REGALLOC-GFX90A-NEXT: {{  $}}
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %25
-  ; REGALLOC-GFX90A-NEXT:   [[COPY:%[0-9]+]]:av_128_align2 = COPY %25
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %23
-  ; REGALLOC-GFX90A-NEXT:   SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %26
+  ; REGALLOC-GFX90A-NEXT:   [[COPY:%[0-9]+]]:av_128_align2 = COPY %26
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %24
+  ; REGALLOC-GFX90A-NEXT:   SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX90A-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; REGALLOC-GFX90A-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; REGALLOC-GFX90A-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX90A-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; REGALLOC-GFX90A-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
   ; REGALLOC-GFX90A-NEXT:   [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX90A-NEXT:   S_ENDPGM 0
   ;
   ; PEI-GFX90A-LABEL: name: partial_copy
   ; PEI-GFX90A: bb.0 (%ir-block.0):
-  ; PEI-GFX90A-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
+  ; PEI-GFX90A-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
   ; PEI-GFX90A-NEXT: {{  $}}
-  ; PEI-GFX90A-NEXT:   $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
-  ; PEI-GFX90A-NEXT:   $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
-  ; PEI-GFX90A-NEXT:   $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+  ; PEI-GFX90A-NEXT:   $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+  ; PEI-GFX90A-NEXT:   $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+  ; PEI-GFX90A-NEXT:   $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
   ; PEI-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
   ; PEI-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
   ; PEI-GFX90A-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
   ; PEI-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
-  ; PEI-GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+  ; PEI-GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
   ; PEI-GFX90A-NEXT:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
   ; PEI-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX90A-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
@@ -94,7 +94,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; PEI-GFX90A-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
   ; PEI-GFX90A-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
   ; PEI-GFX90A-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-  ; PEI-GFX90A-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+  ; PEI-GFX90A-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
   ; PEI-GFX90A-NEXT:   $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
   ; PEI-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
index 1fd311f225db4a..51e0eb6d497945 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -153,7 +153,7 @@ define internal void @merge_cycle_1() #3 {
 
 define amdgpu_kernel void @kernel_64_256() #7 {
 ; CHECK-LABEL: define {{[^@]+}}@kernel_64_256
-; CHECK-SAME: () #[[ATTR6]] {
+; CHECK-SAME: () #[[ATTR8:[0-9]+]] {
 ; CHECK-NEXT:    call void @merge_cycle_0()
 ; CHECK-NEXT:    call void @default_captured_address()
 ; CHECK-NEXT:    call void @externally_visible_default()
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index 9577f2a932f966..e911df4aa4da3b 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -210,7 +210,7 @@ define void @externally_visible_default() {
 ; 1,10 -> 3,8
 define internal i32 @bitcasted_function() {
 ; CHECK-LABEL: define internal i32 @bitcasted_function
-; CHECK-SAME: () #[[ATTR8]] {
+; CHECK-SAME: () #[[ATTR10:[0-9]+]] {
 ; CHECK-NEXT:    ret i32 0
 ;
   ret i32 0
@@ -407,7 +407,7 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
 ; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index ab51d6e249e903..e1ce4d40b627d7 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -148,7 +148,7 @@ define amdgpu_kernel void @kernel_lds() {
 
 define internal i16 @mutual_recursion_0(i16 %arg) {
 ; CHECK-LABEL: define internal i16 @mutual_recursion_0(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR5:[0-9]+]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; CHECK-NEXT:    [[RECURSIVE_KERNEL_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[RECURSIVE_KERNEL_LDS]], align 4
@@ -168,7 +168,7 @@ define internal i16 @mutual_recursion_0(i16 %arg) {
 
 define internal void @mutual_recursion_1(i16 %arg) {
 ; CHECK-LABEL: define internal void @mutual_recursion_1(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR5]] {
 ; CHECK-NEXT:    call void @mutual_recursion_0(i16 [[ARG]])
 ; CHECK-NEXT:    ret void
 ;
@@ -178,7 +178,7 @@ define internal void @mutual_recursion_1(i16 %arg) {
 
 define amdgpu_kernel void @kernel_lds_recursion() {
 ; CHECK-LABEL: define amdgpu_kernel void @kernel_lds_recursion(
-; CHECK-SAME: ) #[[ATTR2]] !llvm.amdgcn.lds.kernel.id !9 {
+; CHECK-SAME: ) #[[ATTR6:[0-9]+]] !llvm.amdgcn.lds.kernel.id !9 {
 ; CHECK-NEXT:    call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ]
 ; CHECK-NEXT:    call void @mutual_recursion_0(i16 0)
 ; CHECK-NEXT:    ret void
@@ -196,6 +196,8 @@ define amdgpu_kernel void @kernel_lds_recursion() {
 ; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
 ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index c2132cf907fdb2..847c2d343d4153 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -6,6 +6,9 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b,
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s2
@@ -33,9 +36,12 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x5a
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_sad_u32 v2, s2, v0, 20
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
@@ -57,6 +63,9 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s2
@@ -83,7 +92,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s16, s16, s13
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s3, s0, s1
@@ -92,6 +103,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_add_i32 s0, s0, s2
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -119,12 +131,15 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s16, s16, s13
-; GCN-NEXT:    s_addc_u32 s17, s17, 0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    v_sad_u32 v2, s0, v2, v3
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
@@ -151,7 +166,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s16, s16, s13
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_max_u32 s3, s0, s1
@@ -162,6 +179,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v3
 ; GCN-NEXT:    s_endpgm
@@ -186,7 +204,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s16, s16, s13
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s3, s0, s1
@@ -197,6 +217,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v3
 ; GCN-NEXT:    s_endpgm
@@ -222,7 +243,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s16, s16, s13
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s3, s0, s1
@@ -233,6 +256,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v3
 ; GCN-NEXT:    s_endpgm
@@ -255,7 +279,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_u32 s16, s16, s13
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_add_u32 s16, s16, s15
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s3, s0, s1
@@ -264,6 +290,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_add_i32 s0, s0, s2
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -285,6 +312,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; GCN-LABEL: v_sad_u32_vector_pat1:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0xc
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
@@ -321,6 +351,9 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32
 define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; GCN-LABEL: v_sad_u32_vector_pat2:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0xc
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
@@ -358,6 +391,8 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
 ; GCN-NEXT:    s_load_dword s4, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 16
@@ -365,6 +400,7 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    v_sad_u32 v2, s4, v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    flat_store_short v[0:1], v2
 ; GCN-NEXT:    s_endpgm
@@ -384,6 +420,9 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
 define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
 ; GCN-LABEL: v_sad_u32_i16_pat2:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    flat_load_ushort v0, v[0:1] glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -416,6 +455,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s3, s2, 0xff
 ; GCN-NEXT:    s_bfe_u32 s4, s2, 0x80008
@@ -443,6 +485,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
 define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
 ; GCN-LABEL: v_sad_u32_i8_pat2:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -475,6 +520,9 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s3, s2, 0xff
 ; GCN-NEXT:    s_bfe_u32 s4, s2, 0x80008
@@ -502,6 +550,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_max_u32 s6, s0, s1
 ; GCN-NEXT:    s_cmp_le_u32 s0, s1
@@ -531,6 +582,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s3, s0, s3
 ; GCN-NEXT:    s_sub_i32 s6, s1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index 89a09dc4fcc171..9826585df8bd80 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -9,6 +9,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -24,6 +26,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX906-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX906-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX906-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -39,6 +43,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX908:       ; %bb.0: ; %entry
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX908-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -55,6 +61,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -88,6 +96,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -103,6 +113,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX906-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX906-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX906-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -118,6 +130,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX908:       ; %bb.0: ; %entry
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX908-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -134,6 +148,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
+; GFX90A-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index ad82869c001f6f..597f2d875cac47 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -26,175 +26,179 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x8
 ; CHECK-NEXT:    v_writelane_b32 v23, s3, 1
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[4:7]
+; CHECK-NEXT:    ; def s[48:51]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[4:11]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_writelane_b32 v23, s4, 2
 ; CHECK-NEXT:    v_writelane_b32 v23, s5, 3
 ; CHECK-NEXT:    v_writelane_b32 v23, s6, 4
 ; CHECK-NEXT:    v_writelane_b32 v23, s7, 5
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[4:11]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 6
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 7
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 8
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 9
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 10
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 11
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 12
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 13
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 6
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 7
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 8
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 9
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:19]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 14
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 15
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 16
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 17
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 18
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 19
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 20
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 21
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 22
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 23
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 24
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 25
-; CHECK-NEXT:    v_writelane_b32 v23, s16, 26
-; CHECK-NEXT:    v_writelane_b32 v23, s17, 27
-; CHECK-NEXT:    v_writelane_b32 v23, s18, 28
-; CHECK-NEXT:    v_writelane_b32 v23, s19, 29
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 10
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 11
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 12
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 13
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 14
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 15
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 16
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 17
+; CHECK-NEXT:    v_writelane_b32 v23, s12, 18
+; CHECK-NEXT:    v_writelane_b32 v23, s13, 19
+; CHECK-NEXT:    v_writelane_b32 v23, s14, 20
+; CHECK-NEXT:    v_writelane_b32 v23, s15, 21
+; CHECK-NEXT:    v_writelane_b32 v23, s16, 22
+; CHECK-NEXT:    v_writelane_b32 v23, s17, 23
+; CHECK-NEXT:    v_writelane_b32 v23, s18, 24
+; CHECK-NEXT:    v_writelane_b32 v23, s19, 25
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[42:43]
+; CHECK-NEXT:    ; def s[38:39]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[52:55]
+; CHECK-NEXT:    ; def s[44:47]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:11]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 30
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 31
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 32
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 33
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 34
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 35
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 36
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 37
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 26
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 27
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 28
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 29
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 30
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 31
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 32
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 33
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[16:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[40:41]
+; CHECK-NEXT:    ; def s[36:37]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[36:39]
+; CHECK-NEXT:    ; def s[40:43]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[44:51]
+; CHECK-NEXT:    ; def s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 34
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 35
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 36
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 37
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 38
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 39
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 40
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 41
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 38
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 39
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 40
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 41
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 42
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 43
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 44
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 45
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 46
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 47
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 48
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 49
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 50
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 51
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 52
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 53
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 42
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 43
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 44
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 45
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 46
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 47
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 48
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 49
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 50
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 51
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 52
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 53
+; CHECK-NEXT:    v_writelane_b32 v23, s12, 54
+; CHECK-NEXT:    v_writelane_b32 v23, s13, 55
+; CHECK-NEXT:    v_writelane_b32 v23, s14, 56
+; CHECK-NEXT:    v_writelane_b32 v23, s15, 57
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[34:35]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 54
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 55
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 56
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 57
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_writelane_b32 v23, s0, 58
 ; CHECK-NEXT:    v_writelane_b32 v23, s1, 59
 ; CHECK-NEXT:    v_writelane_b32 v23, s2, 60
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:    v_writelane_b32 v23, s3, 61
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 62
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 0
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 63
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 1
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[0:7]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 62
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 0
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 1
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 2
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 3
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 4
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 63
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 5
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 2
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 3
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 4
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 5
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 6
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 7
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 8
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s8, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s9, 11
-; CHECK-NEXT:    v_writelane_b32 v0, s10, 12
-; CHECK-NEXT:    v_writelane_b32 v0, s11, 13
-; CHECK-NEXT:    v_writelane_b32 v0, s12, 14
-; CHECK-NEXT:    v_writelane_b32 v0, s13, 15
-; CHECK-NEXT:    v_writelane_b32 v0, s14, 16
-; CHECK-NEXT:    v_writelane_b32 v0, s15, 17
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 6
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 7
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 8
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 9
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 11
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 12
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 13
+; CHECK-NEXT:    v_writelane_b32 v0, s8, 14
+; CHECK-NEXT:    v_writelane_b32 v0, s9, 15
+; CHECK-NEXT:    v_writelane_b32 v0, s10, 16
+; CHECK-NEXT:    v_writelane_b32 v0, s11, 17
+; CHECK-NEXT:    v_writelane_b32 v0, s12, 18
+; CHECK-NEXT:    v_writelane_b32 v0, s13, 19
+; CHECK-NEXT:    v_writelane_b32 v0, s14, 20
+; CHECK-NEXT:    v_writelane_b32 v0, s15, 21
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 18
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 19
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 22
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 23
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 20
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 21
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 22
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 23
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_writelane_b32 v0, s0, 24
 ; CHECK-NEXT:    v_writelane_b32 v0, s1, 25
 ; CHECK-NEXT:    v_writelane_b32 v0, s2, 26
 ; CHECK-NEXT:    v_writelane_b32 v0, s3, 27
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 28
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 29
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 30
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 31
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[0:7]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 28
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 29
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 30
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 31
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 32
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 33
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 34
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 35
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 32
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 33
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 34
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 35
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 36
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 37
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 38
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 39
-; CHECK-NEXT:    v_writelane_b32 v0, s8, 40
-; CHECK-NEXT:    v_writelane_b32 v0, s9, 41
-; CHECK-NEXT:    v_writelane_b32 v0, s10, 42
-; CHECK-NEXT:    v_writelane_b32 v0, s11, 43
-; CHECK-NEXT:    v_writelane_b32 v0, s12, 44
-; CHECK-NEXT:    v_writelane_b32 v0, s13, 45
-; CHECK-NEXT:    v_writelane_b32 v0, s14, 46
-; CHECK-NEXT:    v_writelane_b32 v0, s15, 47
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 36
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 37
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 38
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 39
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 40
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 41
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 42
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 43
+; CHECK-NEXT:    v_writelane_b32 v0, s8, 44
+; CHECK-NEXT:    v_writelane_b32 v0, s9, 45
+; CHECK-NEXT:    v_writelane_b32 v0, s10, 46
+; CHECK-NEXT:    v_writelane_b32 v0, s11, 47
+; CHECK-NEXT:    v_writelane_b32 v0, s12, 48
+; CHECK-NEXT:    v_writelane_b32 v0, s13, 49
+; CHECK-NEXT:    v_writelane_b32 v0, s14, 50
+; CHECK-NEXT:    v_writelane_b32 v0, s15, 51
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %ret
 ; CHECK-NEXT:    ; kill: killed $vgpr23
@@ -210,166 +214,170 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    v_readlane_b32 s1, v23, 3
 ; CHECK-NEXT:    v_readlane_b32 s2, v23, 4
 ; CHECK-NEXT:    v_readlane_b32 s3, v23, 5
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 6
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 7
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 8
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 9
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:3]
+; CHECK-NEXT:    ; use s[48:51]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 6
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 7
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 8
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 9
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 10
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 11
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 12
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 13
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 14
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 15
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 16
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 17
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 18
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 19
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 20
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 21
-; CHECK-NEXT:    v_readlane_b32 s8, v23, 22
-; CHECK-NEXT:    v_readlane_b32 s9, v23, 23
-; CHECK-NEXT:    v_readlane_b32 s10, v23, 24
-; CHECK-NEXT:    v_readlane_b32 s11, v23, 25
-; CHECK-NEXT:    v_readlane_b32 s12, v23, 26
-; CHECK-NEXT:    v_readlane_b32 s13, v23, 27
-; CHECK-NEXT:    v_readlane_b32 s14, v23, 28
-; CHECK-NEXT:    v_readlane_b32 s15, v23, 29
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 10
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 11
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 12
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 13
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 14
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 15
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 16
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 17
+; CHECK-NEXT:    v_readlane_b32 s8, v23, 18
+; CHECK-NEXT:    v_readlane_b32 s9, v23, 19
+; CHECK-NEXT:    v_readlane_b32 s10, v23, 20
+; CHECK-NEXT:    v_readlane_b32 s11, v23, 21
+; CHECK-NEXT:    v_readlane_b32 s12, v23, 22
+; CHECK-NEXT:    v_readlane_b32 s13, v23, 23
+; CHECK-NEXT:    v_readlane_b32 s14, v23, 24
+; CHECK-NEXT:    v_readlane_b32 s15, v23, 25
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 30
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 31
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 32
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 33
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 34
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 35
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 36
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 37
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 26
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 27
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 28
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 29
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 30
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 31
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 32
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 33
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[42:43]
+; CHECK-NEXT:    ; use s[38:39]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[52:55]
+; CHECK-NEXT:    ; use s[44:47]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 38
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 39
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 40
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 41
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 34
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 35
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 36
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 37
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 38
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 39
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 40
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 41
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[16:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[40:41]
+; CHECK-NEXT:    ; use s[36:37]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[36:39]
+; CHECK-NEXT:    ; use s[40:43]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[44:51]
+; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 42
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 43
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 44
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 45
-; CHECK-NEXT:    v_readlane_b32 s8, v23, 46
-; CHECK-NEXT:    v_readlane_b32 s9, v23, 47
-; CHECK-NEXT:    v_readlane_b32 s10, v23, 48
-; CHECK-NEXT:    v_readlane_b32 s11, v23, 49
-; CHECK-NEXT:    v_readlane_b32 s12, v23, 50
-; CHECK-NEXT:    v_readlane_b32 s13, v23, 51
-; CHECK-NEXT:    v_readlane_b32 s14, v23, 52
-; CHECK-NEXT:    v_readlane_b32 s15, v23, 53
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 42
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 43
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 44
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 45
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 46
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 47
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 48
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 49
+; CHECK-NEXT:    v_readlane_b32 s8, v23, 50
+; CHECK-NEXT:    v_readlane_b32 s9, v23, 51
+; CHECK-NEXT:    v_readlane_b32 s10, v23, 52
+; CHECK-NEXT:    v_readlane_b32 s11, v23, 53
+; CHECK-NEXT:    v_readlane_b32 s12, v23, 54
+; CHECK-NEXT:    v_readlane_b32 s13, v23, 55
+; CHECK-NEXT:    v_readlane_b32 s14, v23, 56
+; CHECK-NEXT:    v_readlane_b32 s15, v23, 57
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 54
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 55
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 56
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 57
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 58
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 59
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 60
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 61
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[34:35]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 58
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 59
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 60
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 61
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 62
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 63
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 0
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 1
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 62
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 63
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 0
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 1
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 2
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 3
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 4
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 5
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 2
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 3
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 4
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 5
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 6
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 7
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 8
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 9
-; CHECK-NEXT:    v_readlane_b32 s8, v0, 10
-; CHECK-NEXT:    v_readlane_b32 s9, v0, 11
-; CHECK-NEXT:    v_readlane_b32 s10, v0, 12
-; CHECK-NEXT:    v_readlane_b32 s11, v0, 13
-; CHECK-NEXT:    v_readlane_b32 s12, v0, 14
-; CHECK-NEXT:    v_readlane_b32 s13, v0, 15
-; CHECK-NEXT:    v_readlane_b32 s14, v0, 16
-; CHECK-NEXT:    v_readlane_b32 s15, v0, 17
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 6
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 7
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 8
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 9
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 10
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 11
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 12
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 13
+; CHECK-NEXT:    v_readlane_b32 s8, v0, 14
+; CHECK-NEXT:    v_readlane_b32 s9, v0, 15
+; CHECK-NEXT:    v_readlane_b32 s10, v0, 16
+; CHECK-NEXT:    v_readlane_b32 s11, v0, 17
+; CHECK-NEXT:    v_readlane_b32 s12, v0, 18
+; CHECK-NEXT:    v_readlane_b32 s13, v0, 19
+; CHECK-NEXT:    v_readlane_b32 s14, v0, 20
+; CHECK-NEXT:    v_readlane_b32 s15, v0, 21
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 18
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 19
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 22
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 23
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 20
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 21
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 22
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 23
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:3]
-; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_readlane_b32 s0, v0, 24
 ; CHECK-NEXT:    v_readlane_b32 s1, v0, 25
 ; CHECK-NEXT:    v_readlane_b32 s2, v0, 26
 ; CHECK-NEXT:    v_readlane_b32 s3, v0, 27
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 28
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 29
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 30
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 31
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[0:3]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 28
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 29
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 30
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 31
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 32
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 33
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 34
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 35
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 32
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 33
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 34
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 35
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 36
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 37
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 38
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 39
-; CHECK-NEXT:    v_readlane_b32 s8, v0, 40
-; CHECK-NEXT:    v_readlane_b32 s9, v0, 41
-; CHECK-NEXT:    v_readlane_b32 s10, v0, 42
-; CHECK-NEXT:    v_readlane_b32 s11, v0, 43
-; CHECK-NEXT:    v_readlane_b32 s12, v0, 44
-; CHECK-NEXT:    v_readlane_b32 s13, v0, 45
-; CHECK-NEXT:    v_readlane_b32 s14, v0, 46
-; CHECK-NEXT:    v_readlane_b32 s15, v0, 47
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 36
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 37
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 38
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 39
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 40
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 41
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 42
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 43
+; CHECK-NEXT:    v_readlane_b32 s8, v0, 44
+; CHECK-NEXT:    v_readlane_b32 s9, v0, 45
+; CHECK-NEXT:    v_readlane_b32 s10, v0, 46
+; CHECK-NEXT:    v_readlane_b32 s11, v0, 47
+; CHECK-NEXT:    v_readlane_b32 s12, v0, 48
+; CHECK-NEXT:    v_readlane_b32 s13, v0, 49
+; CHECK-NEXT:    v_readlane_b32 s14, v0, 50
+; CHECK-NEXT:    v_readlane_b32 s15, v0, 51
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index ebc916b5c889b5..f69b7ae1051240 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -182,8 +182,10 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_shl_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, s4, 64
 ; GCN-NEXT:    s_sub_i32 s12, 64, s4
@@ -203,6 +205,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -215,8 +218,10 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_lshr_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, s4, 64
 ; GCN-NEXT:    s_sub_i32 s12, 64, s4
@@ -236,6 +241,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -248,8 +254,10 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_ashr_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, 64, s4
 ; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
@@ -270,6 +278,7 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -430,6 +439,9 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_shl_v2i128ss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
@@ -502,6 +514,9 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_lshr_v2i128_ss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
@@ -574,6 +589,9 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_ashr_v2i128_ss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
index 850446c414049d..840e8bdfb7bab3 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
@@ -58,9 +58,6 @@ entry:
 }
 
 ;.
-; OW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
-;.
-; CW: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index b8721129222043..a38f0a6d86b8ca 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -9,6 +9,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -20,6 +23,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -38,11 +44,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -52,11 +61,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -72,6 +84,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_bitcmp1_b32 s2, 0
 ; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -86,6 +101,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitcmp1_b32 s2, 0
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -104,6 +122,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; CI-LABEL: s_sint_to_fp_i64_to_f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s3
 ; CI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -117,6 +138,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; VI-LABEL: s_sint_to_fp_i64_to_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s3
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -136,6 +160,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -155,6 +182,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -183,6 +213,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i8 s2, s2
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
@@ -195,6 +228,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bfe_i32 s2, s2, 0x80000
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
@@ -232,11 +268,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -246,11 +285,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -283,11 +325,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -297,11 +342,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -353,11 +401,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, 0
+; CI-NEXT:    s_add_i32 s10, s10, s15
+; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0, 0xbff00000
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -367,11 +418,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0, 0xbff00000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index b4a981f1db4ec7..0a6009c0e7da8a 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -12,10 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
   ; GCN-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; GCN-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; GCN-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %30.sub0
-  ; GCN-NEXT:   SI_SPILL_V64_SAVE %30, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; GCN-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %31.sub0
+  ; GCN-NEXT:   SI_SPILL_V64_SAVE %31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
-  ; GCN-NEXT:   GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; GCN-NEXT:   GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
   ; GCN-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]]
   ; GCN-NEXT:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 3644bef9c20a1f..b2e334c66ccd25 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -50,7 +50,10 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 {
 define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
 ; HAWAII-LABEL: local_store_i55:
 ; HAWAII:       ; %bb.0:
+; HAWAII-NEXT:    s_add_i32 s10, s10, s15
 ; HAWAII-NEXT:    s_or_b32 s0, s6, 14
+; HAWAII-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; HAWAII-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
 ; HAWAII-NEXT:    v_mov_b32_e32 v1, s7
 ; HAWAII-NEXT:    flat_load_ubyte v0, v[0:1]
@@ -70,7 +73,10 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
 ;
 ; FIJI-LABEL: local_store_i55:
 ; FIJI:       ; %bb.0:
+; FIJI-NEXT:    s_add_i32 s10, s10, s15
 ; FIJI-NEXT:    s_or_b32 s0, s6, 14
+; FIJI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; FIJI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; FIJI-NEXT:    v_mov_b32_e32 v0, s0
 ; FIJI-NEXT:    v_mov_b32_e32 v1, s7
 ; FIJI-NEXT:    flat_load_ubyte v0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 19d633651fdd0d..30accc846d2b6a 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000  @...............
+; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
 
 ; ELF: AMDGPU Metadata
 ; ELF: .sgpr_count:     9
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 2097579e0c9959..4f84b31f1877b0 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000  ................
+; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
 
 ; ELF: AMDGPU Metadata
 ; ELF: .sgpr_count:     5
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 775c62e73261a9..0b1bd11b88d5d6 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack < %s | FileCheck --check-prefixes=ASM %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s
@@ -6,15 +7,17 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM: .amdhsa_next_free_sgpr 5
-; ASM: .amdhsa_reserve_xnack_mask 1
+; ASM:       ; %bb.0: ; %entry
+; ASM-NEXT:    ;;#ASMSTART
+; ASM-NEXT:    ;;#ASMEND
+; ASM-NEXT:    s_endpgm
 
 ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
 ; OBJ: Contents of section .rodata:
 ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000  @...............
+; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000  @.......!.......
 
 ; ELF: AMDGPU Metadata
 ; ELF: .sgpr_count:     9
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 52370f6a2ef054..3d1f283270ccfd 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -23,11 +23,14 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; HSA-TRAP-GFX803-LABEL: trap:
 ; HSA-TRAP-GFX803:       ; %bb.0:
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s2
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s3
+; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; HSA-TRAP-GFX803-NEXT:    flat_store_dword v[0:1], v2
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    s_trap 2
@@ -121,6 +124,9 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
 ; HSA-TRAP-GFX803-LABEL: non_entry_trap:
 ; HSA-TRAP-GFX803:       ; %bb.0: ; %entry
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s0
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s1
@@ -283,6 +289,9 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
 ; HSA-TRAP-GFX803:       ; %bb.0:
 ; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s4
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s5
@@ -417,10 +426,13 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
 ; HSA-TRAP-GFX803-LABEL: debugtrap:
 ; HSA-TRAP-GFX803:       ; %bb.0:
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v3, 2
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v3, 2
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; HSA-TRAP-GFX803-NEXT:    flat_store_dword v[0:1], v2
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 735956caa72da4..317e350f3eafe5 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -81,6 +81,9 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-LABEL: udiv_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -252,6 +255,9 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; GCN-LABEL: s_udiv_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GCN-NEXT:    s_sub_i32 s4, 0, s3
@@ -457,6 +463,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: udiv_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -810,6 +819,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: udiv_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-NEXT:    s_addc_u32 s5, s3, 0
@@ -1135,6 +1147,9 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
 ; GCN-LABEL: udiv_i32_div_pow2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1224,6 +1239,9 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: udiv_i32_div_k_even:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1318,6 +1336,9 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
 ; GCN-LABEL: udiv_i32_div_k_odd:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1430,6 +1451,9 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GCN-LABEL: v_udiv_i8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1570,6 +1594,9 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: v_udiv_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1726,6 +1753,9 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: v_udiv_i23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, s2, 4
 ; GCN-NEXT:    s_addc_u32 s5, s3, 0
@@ -1923,6 +1953,9 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: v_udiv_i24:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, s2, 4
 ; GCN-NEXT:    s_addc_u32 s5, s3, 0
@@ -2105,6 +2138,9 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
 ; GCN-LABEL: scalarize_mulhu_4xi32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -2218,6 +2254,9 @@ define amdgpu_kernel void @test_udiv2(i32 %p) {
 ; GCN-LABEL: test_udiv2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
@@ -2281,6 +2320,9 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
@@ -2371,6 +2413,9 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon
 ;
 ; GCN-LABEL: fdiv_test_denormals:
 ; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    flat_load_sbyte v2, v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index d00ea6dff24474..44b16d7f65dc55 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -9,6 +9,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -28,6 +31,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -54,6 +60,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; SI-LABEL: s_uint_to_fp_i64_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -67,6 +76,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; VI-LABEL: s_uint_to_fp_i64_to_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -86,6 +98,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
@@ -103,6 +118,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 ; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
@@ -126,6 +144,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) {
 ; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
 ; SI:       ; %bb.0:
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x8
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -158,6 +179,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
 ;
 ; VI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
 ; VI:       ; %bb.0:
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x20
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -196,6 +220,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
@@ -207,6 +234,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -222,6 +252,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
 ; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
+; GCN-NEXT:    s_add_i32 s10, s10, s15
+; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[2:3], s3
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
@@ -239,6 +272,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
@@ -259,6 +295,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
@@ -286,11 +325,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -300,11 +342,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -320,6 +365,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bitcmp1_b32 s2, 0
 ; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -334,6 +382,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitcmp1_b32 s2, 0
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -353,6 +404,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_and_b32 s2, s2, 0xff
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
@@ -365,6 +419,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0xff
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
@@ -402,11 +459,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -416,11 +476,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -453,11 +516,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -467,11 +533,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -505,11 +574,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    v_mov_b32_e32 v0, 0
+; SI-NEXT:    s_add_i32 s10, s10, s15
+; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -519,11 +591,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, 0
+; VI-NEXT:    s_add_i32 s10, s10, s15
+; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 3046480b3c0d2a..fa4db7d4a2d550 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -25,8 +25,9 @@
 ; CHECK-NEXT: argumentInfo:
 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr6' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr8' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
 ; CHECK-NEXT: workItemIDX:     { reg: '$vgpr0' }
 ; CHECK-NEXT: psInputAddr:     0
 ; CHECK-NEXT: psInputEnable:   0
@@ -40,7 +41,7 @@
 ; CHECK-NEXT: BitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy:       8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
 ; CHECK-NEXT: body:
   define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 3f6f0c909e8bbf..99c3ed91351dc1 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -25,8 +25,9 @@
 ; CHECK-NEXT: argumentInfo:
 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr6' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr8' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
 ; CHECK-NEXT: workItemIDX:     { reg: '$vgpr0' }
 ; CHECK-NEXT: psInputAddr:     0
 ; CHECK-NEXT: psInputEnable:   0
@@ -40,7 +41,7 @@
 ; CHECK-NEXT: BitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy:       8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {

>From 6798dc40bbcee98ec3a283a0417564e9a28ce457 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Sat, 17 Aug 2024 16:40:41 -0700
Subject: [PATCH 3/8] Previous code was incorrect for indirect calls of known
 callees.

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |  17 +-
 .../AMDGPU/amdgpu-attributor-no-agpr.ll       |   3 +-
 .../AMDGPU/attributor-flatscratchinit.ll      | 359 ++++++++++--------
 .../CodeGen/AMDGPU/propagate-waves-per-eu.ll  |   2 +-
 .../AMDGPU/remove-no-kernel-id-attribute.ll   |   6 +-
 5 files changed, 205 insertions(+), 182 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 1d3a3cea3e2187..c4c3a06155029c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -692,20 +692,23 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
   // Returns true if FlatScratchInit is needed, i.e., no-flat-scratch-init is
   // not to be set.
   bool needFlatScratchInit(Attributor &A) {
+    assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is still set
+
     // This is called on each callee; false means callee shouldn't have
     // no-flat-scratch-init.
     auto CheckForNoFlatScratchInit = [&](Instruction &I) {
       const auto &CB = cast<CallBase>(I);
       const Function *Callee = CB.getCalledFunction();
-      if (!Callee) // indirect call
-        return CB.isInlineAsm();
 
-      if (Callee->isIntrinsic())
-        return Callee->getIntrinsicID() != Intrinsic::amdgcn_addrspacecast_nonnull;
+      if (Callee && Callee->isIntrinsic())
+        return Callee->getIntrinsicID() !=
+               Intrinsic::amdgcn_addrspacecast_nonnull;
 
-      const auto *CalleeInfo = A.getAAFor<AAAMDAttributes>(
-          *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
-      return CalleeInfo && CalleeInfo->isAssumed(FLAT_SCRATCH_INIT);
+      // Return true for all other cases, including (1) inline asm, (2) direct
+      // call, and (3) indirect call with known callees. For (2) and (3),
+      // updateImpl() has already checked the callees, so we know their
+      // FLAT_SCRATCH_INIT bit is set.
+      return true;
     };
 
     bool UsedAssumedInformation = false;
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
index 5ace66fd2dd762..c96c6d39a20b51 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -229,7 +229,7 @@ define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) {
 
 define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
 ; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
-; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR7:[0-9]+]] {
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
 ; CHECK-NEXT:    call void [[FPTR]]()
 ; CHECK-NEXT:    ret void
@@ -249,7 +249,6 @@ attributes #0 = { "amdgpu-no-agpr" }
 ; CHECK: attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
 ; CHECK: attributes #[[ATTR9:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
 ; CHECK: attributes #[[ATTR10]] = { "amdgpu-no-agpr" }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index 6d9da9281211cb..f04c93961b6705 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -1,6 +1,6 @@
 ; Test the generation of the attribute amdgpu-no-flat-scratch-init
-; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s
 
 ;; tests of alloca
 
@@ -160,40 +160,40 @@ define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
 
 define void @call_without_alloca() {
 ; GFX9-LABEL: define void @call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   call void @without_alloca(i1 true)
   ret void
 }
 
 define amdgpu_kernel void @call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   call void @without_alloca(i1 true)
   ret void
 }
 
 define void @call_with_alloca() {
 ; GFX9-LABEL: define void @call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @with_alloca()
   ret void
 }
 
 define amdgpu_kernel void @call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @with_alloca()
   ret void
 }
@@ -222,50 +222,50 @@ define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
 
 define void @call_call_without_alloca() {
 ; GFX9-LABEL: define void @call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   call void @call_without_alloca()
   ret void
 }
 
 define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   call void @call_without_alloca()
   ret void
 }
 
 define void @call_call_with_alloca() {
 ; GFX9-LABEL: define void @call_call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @call_with_alloca()
   ret void
 }
 
 define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @call_with_alloca()
   ret void
 }
 
 define void @with_alloca_call_without_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @without_alloca()
@@ -274,10 +274,10 @@ define void @with_alloca_call_without_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @without_alloca()
@@ -286,10 +286,10 @@ define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
 
 define void @with_alloca_call_with_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @with_alloca()
@@ -298,10 +298,10 @@ define void @with_alloca_call_with_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @with_alloca()
@@ -310,10 +310,10 @@ define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
 
 define void @with_alloca_call_call_without_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_without_alloca()
@@ -322,10 +322,10 @@ define void @with_alloca_call_call_without_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_without_alloca()
@@ -334,10 +334,10 @@ define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
 
 define void @with_alloca_call_call_with_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_with_alloca()
@@ -346,10 +346,10 @@ define void @with_alloca_call_call_with_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_with_alloca()
@@ -360,30 +360,30 @@ define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
 
 define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -391,10 +391,10 @@ define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 
 define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -402,30 +402,30 @@ define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrs
 
 define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(2) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(2) %ptr
   ret void
 }
 
 define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(2) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -433,10 +433,10 @@ define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 
 define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(2) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -444,30 +444,30 @@ define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrs
 
 define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(3) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(3) %ptr
   ret void
 }
 
 define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -475,10 +475,10 @@ define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 
 define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -486,30 +486,30 @@ define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrsp
 
 define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(4) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(4) %ptr
   ret void
 }
 
 define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(4) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -517,10 +517,10 @@ define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 
 define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(4) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -528,30 +528,30 @@ define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr add
 
 define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -559,10 +559,10 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -570,50 +570,50 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr
 
 define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -621,10 +621,10 @@ define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrsp
 
 define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -632,78 +632,70 @@ define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacec
 
 define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
-
-
-
-
-
-
-
-
 define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -712,10 +704,10 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(
 
 define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -724,10 +716,10 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_
 
 define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -736,10 +728,10 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5)
 
 define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI6:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI6:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -750,10 +742,10 @@ define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_
 
 define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
   call void @without_alloca(i1 true)
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -761,10 +753,10 @@ define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr
 
 define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   call void @without_alloca(i1 true)
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -772,10 +764,10 @@ define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kern
 
 define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
   call void @without_alloca(i1 true)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -783,16 +775,16 @@ define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
   call void @without_alloca(i1 true)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
-;; tests of indirect call, intrinsics
+;; tests of indirect call, intrinsics, inline asm
 
 @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
 
@@ -838,14 +830,33 @@ define amdgpu_kernel void @call_with_indirect_call_cc_kernel() {
   ret void
 }
 
+define void @empty() {
+  ret void
+}
+
+define void @also_empty() {
+  ret void
+}
+
+define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) {
+; GFX9-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond)
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond)
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+  %fptr = select i1 %cond, ptr @empty, ptr @also_empty
+  call void %fptr()
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workgroup.id.x()
 
 define void @use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI7:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI7:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %val, ptr addrspace(1) undef
   ret void
@@ -853,10 +864,10 @@ define void @use_intrinsic_workitem_id_x() {
 
 define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %val, ptr addrspace(1) undef
   ret void
@@ -864,84 +875,96 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 
 define void @call_use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI7:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4]]
 ;
 ; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI7:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI5:[0-9]+]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
+define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+  %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
+  store volatile i32 7, ptr %1, align 4
+  ret void
+}
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI3]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI4]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) {
+; GFX9-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+;
+; GFX10-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
+; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+  call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
+  ret void
+}
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+define amdgpu_kernel void @with_inline_asm() {
+; GFX9-LABEL: with_inline_asm
+; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3]]
+;
+; GFX10-LABEL: with_inline_asm
+; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3]]
+  call void asm sideeffect "; use $0", "a"(i32 poison)
+  ret void
+}
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI5]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI6]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI5]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI6]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
 ; GFX9:  attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 ; GFX9:  attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI7]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI3]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI4]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI5]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI6]] = { nofree norecurse nounwind memory(argmem: readwrite, inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI5]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI6]] = { nofree norecurse nounwind memory(inaccessiblemem: readwrite) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
 ; GFX10:  attributes #[[ATTR_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 ; GFX10:  attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI7]] = { nofree norecurse nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+
+; GFX10:  attributes #[[ATTR_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index e911df4aa4da3b..eb4cf5c063d101 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -407,7 +407,7 @@ attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
 ; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index e1ce4d40b627d7..599328147ba805 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -148,7 +148,7 @@ define amdgpu_kernel void @kernel_lds() {
 
 define internal i16 @mutual_recursion_0(i16 %arg) {
 ; CHECK-LABEL: define internal i16 @mutual_recursion_0(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR5:[0-9]+]] {
+; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
 ; CHECK-NEXT:    [[RECURSIVE_KERNEL_LDS:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr addrspace(4) [[RECURSIVE_KERNEL_LDS]], align 4
@@ -168,7 +168,7 @@ define internal i16 @mutual_recursion_0(i16 %arg) {
 
 define internal void @mutual_recursion_1(i16 %arg) {
 ; CHECK-LABEL: define internal void @mutual_recursion_1(
-; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR5]] {
+; CHECK-SAME: i16 [[ARG:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:    call void @mutual_recursion_0(i16 [[ARG]])
 ; CHECK-NEXT:    ret void
 ;
@@ -196,8 +196,6 @@ define amdgpu_kernel void @kernel_lds_recursion() {
 ; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
 ; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
 ;.

>From a3c8eb9947c401648f89f337b1b3179f7093e64e Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Wed, 21 Aug 2024 11:27:13 -0700
Subject: [PATCH 4/8] Undo changes in AMDGPUSubtarget.cpp to reduce impact on
 test files. Those code changes will be in a follow-up PR.

---
 .../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll  |  368 +----
 .../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll  |  390 +----
 .../AMDGPU/GlobalISel/extractelement.ll       |   69 +-
 ...licit-kernarg-backend-usage-global-isel.ll |   36 +-
 .../GlobalISel/insertelement-stack-lower.ll   |    2 +-
 .../AMDGPU/GlobalISel/lds-global-value.ll     |    5 +-
 .../GlobalISel/llvm.amdgcn.if.break.i64.ll    |    3 -
 .../GlobalISel/llvm.amdgcn.trig.preop.ll      |   24 -
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll |   33 -
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll |   33 -
 .../abi-attribute-hints-undefined-behavior.ll |   18 +-
 llvm/test/CodeGen/AMDGPU/addrspacecast.ll     |   10 +-
 llvm/test/CodeGen/AMDGPU/always-uniform.ll    |    3 -
 ...amdgpu-codegenprepare-fold-binop-select.ll |    3 -
 .../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll   |    4 +-
 .../attr-amdgpu-flat-work-group-size.ll       |    4 +-
 .../CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll    |    6 +-
 .../AMDGPU/attr-amdgpu-waves-per-eu.ll        |    4 +-
 .../attributor-flatscratchinit-globalisel.ll  |   54 +-
 llvm/test/CodeGen/AMDGPU/attributor-noopt.ll  |    2 +-
 .../callee-special-input-sgprs-fixed-abi.ll   |   36 +-
 llvm/test/CodeGen/AMDGPU/code-object-v3.ll    |   12 +-
 .../CodeGen/AMDGPU/combine-reg-or-const.ll    |    3 -
 ...dagcomb-extract-vec-elt-different-sizes.ll |    2 -
 ...cannot-create-empty-or-backward-segment.ll |    2 +-
 .../expand-scalar-carry-out-select-user.ll    |    3 -
 llvm/test/CodeGen/AMDGPU/fabs.f16.ll          |   66 -
 .../fast-unaligned-load-store.global.ll       |   20 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     |  236 +--
 .../flat-for-global-subtarget-feature.ll      |    7 +-
 llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll  |   63 +-
 .../AMDGPU/fmul-2-combine-multi-use.ll        |   48 -
 llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll     |   60 -
 .../CodeGen/AMDGPU/fneg-modifier-casting.ll   |    3 -
 llvm/test/CodeGen/AMDGPU/fneg.f16.ll          |   62 +-
 llvm/test/CodeGen/AMDGPU/half.ll              |  231 ---
 .../AMDGPU/hsa-metadata-kernel-code-props.ll  |    9 +-
 llvm/test/CodeGen/AMDGPU/hsa.ll               |    4 +-
 .../AMDGPU/implicit-kernarg-backend-usage.ll  |   36 +-
 llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll   |   24 +-
 .../AMDGPU/insert_vector_elt.v2bf16.ll        |   58 +-
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |  214 +--
 llvm/test/CodeGen/AMDGPU/kernarg-size.ll      |    2 +-
 .../AMDGPU/llvm.amdgcn.lds.kernel.id.ll       |    8 +-
 .../AMDGPU/llvm.amdgcn.readfirstlane.ll       |   70 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll    |  114 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   |  126 +-
 llvm/test/CodeGen/AMDGPU/load-constant-f64.ll |    6 -
 llvm/test/CodeGen/AMDGPU/load-constant-i16.ll |  127 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i32.ll |   85 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i64.ll |   18 -
 llvm/test/CodeGen/AMDGPU/load-constant-i8.ll  |  164 +-
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   |  129 +-
 llvm/test/CodeGen/AMDGPU/load-global-i32.ll   |   83 +-
 llvm/test/CodeGen/AMDGPU/load-select-ptr.ll   |    3 +-
 .../CodeGen/AMDGPU/mad24-get-global-id.ll     |    2 +-
 .../match-perm-extract-vector-elt-bug.ll      |    8 +-
 llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll    |   20 +-
 .../AMDGPU/memory-legalizer-flat-agent.ll     | 1380 -----------------
 .../memory-legalizer-flat-nontemporal.ll      |   75 -
 .../memory-legalizer-flat-singlethread.ll     | 1380 -----------------
 .../AMDGPU/memory-legalizer-flat-system.ll    | 1380 -----------------
 .../AMDGPU/memory-legalizer-flat-volatile.ll  |   66 -
 .../AMDGPU/memory-legalizer-flat-wavefront.ll | 1365 ----------------
 .../AMDGPU/memory-legalizer-flat-workgroup.ll | 1320 ----------------
 .../AMDGPU/memory-legalizer-global-agent.ll   |  273 ----
 .../memory-legalizer-global-nontemporal.ll    |   15 -
 .../memory-legalizer-global-singlethread.ll   |  276 ----
 .../AMDGPU/memory-legalizer-global-system.ll  |  261 ----
 .../memory-legalizer-global-volatile.ll       |   18 -
 .../memory-legalizer-global-wavefront.ll      |  276 ----
 .../memory-legalizer-global-workgroup.ll      |  276 ----
 .../memory-legalizer-local-nontemporal.ll     |    9 -
 .../AMDGPU/memory-legalizer-local-volatile.ll |    6 -
 .../memory-legalizer-private-nontemporal.ll   |   59 +-
 .../memory-legalizer-private-volatile.ll      |   30 +-
 llvm/test/CodeGen/AMDGPU/min.ll               |  210 ---
 llvm/test/CodeGen/AMDGPU/pack.v2f16.ll        |   21 -
 llvm/test/CodeGen/AMDGPU/pack.v2i16.ll        |   18 -
 ...al-regcopy-and-spill-missed-at-regalloc.ll |   56 +-
 llvm/test/CodeGen/AMDGPU/sad.ll               |   68 +-
 .../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll  |   16 -
 .../scc-clobbered-sgpr-to-vmem-spill.ll       |  456 +++---
 .../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll     |    2 +-
 llvm/test/CodeGen/AMDGPU/shift-i128.ll        |   24 +-
 llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll    |   70 +-
 .../CodeGen/AMDGPU/spill-vector-superclass.ll |    6 +-
 llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll |    6 -
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll  |    2 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll  |    2 +-
 llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll   |    9 +-
 llvm/test/CodeGen/AMDGPU/trap-abis.ll         |   16 +-
 llvm/test/CodeGen/AMDGPU/udiv.ll              |   45 -
 llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll    |   91 +-
 .../AMDGPU/vgpr-spill-placement-issue61083.ll |    2 +-
 ...ine-function-info-long-branch-reg-debug.ll |    7 +-
 .../machine-function-info-long-branch-reg.ll  |    7 +-
 97 files changed, 644 insertions(+), 12190 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index cb64c25b5f080e..705bcbddf227a6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -20,14 +20,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -38,14 +35,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -105,14 +99,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v0 offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -123,14 +114,11 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v0 offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -303,9 +291,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -321,9 +306,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -383,9 +365,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -403,9 +382,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -468,9 +444,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -488,9 +461,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_dec_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -553,9 +523,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -568,9 +535,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -621,9 +585,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -638,9 +599,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -694,9 +652,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_dec_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -711,9 +666,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_dec_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -768,9 +720,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -778,7 +728,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -793,9 +742,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -803,7 +750,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -868,9 +814,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -888,9 +831,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -950,9 +890,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -968,9 +905,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -986,8 +920,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_ret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1002,10 +934,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1042,9 +970,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -1062,9 +987,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -1082,8 +1004,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1098,10 +1018,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1141,9 +1057,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -1161,9 +1074,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -1181,8 +1091,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1197,10 +1105,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1240,9 +1144,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1255,9 +1156,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1270,8 +1168,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1283,10 +1179,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1319,9 +1211,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -1336,9 +1225,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -1353,8 +1239,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1366,10 +1250,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1405,9 +1285,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -1422,9 +1299,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -1439,8 +1313,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1452,10 +1324,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1492,9 +1360,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1502,7 +1368,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -1517,9 +1382,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1527,7 +1390,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_dec v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -1542,8 +1404,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -1562,10 +1422,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 42
@@ -1622,9 +1478,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1642,9 +1495,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1662,8 +1512,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -1677,10 +1525,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1727,13 +1571,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1751,13 +1592,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1775,9 +1613,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_ret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -1792,10 +1628,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1834,15 +1666,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1860,15 +1689,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1886,9 +1712,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -1903,10 +1727,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1948,13 +1768,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1964,13 +1781,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1980,9 +1794,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -1994,10 +1806,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2032,15 +1840,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2050,15 +1855,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2068,9 +1870,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -2082,10 +1882,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2123,15 +1919,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2141,15 +1934,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2159,9 +1949,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -2173,10 +1961,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -2215,9 +1999,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2244,9 +2025,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2273,14 +2051,12 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2294,10 +2070,6 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2356,9 +2128,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2377,9 +2146,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2398,14 +2164,12 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_dec_x2 v[3:4], v[1:2] offset:40
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2414,10 +2178,6 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2471,11 +2231,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_dec_rtn_u32 v2, v1, v2 offset:8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    flat_store_dword v[0:1], v3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2492,11 +2249,8 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_dec_rtn_u32 v2, v1, v2 offset:8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2572,10 +2326,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -2591,10 +2342,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1]
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -2662,10 +2410,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -2681,10 +2426,7 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -2870,13 +2612,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_dec_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2889,13 +2628,10 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_dec_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -2955,15 +2691,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_dec_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -2976,15 +2709,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_dec_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3045,15 +2775,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_dec_ret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3066,15 +2793,12 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_dec_ret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3135,13 +2859,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_dec_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3151,13 +2872,10 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_dec_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3208,15 +2926,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_dec_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3226,15 +2941,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_dec_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3286,15 +2998,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_dec_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -3304,15 +3013,12 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_dec_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_dec_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -3365,9 +3071,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3391,9 +3094,6 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3470,9 +3170,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3491,9 +3188,6 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3564,10 +3258,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v4, s3
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v3, s2
 ; CI-NEXT:    flat_store_dword v[3:4], v0
@@ -3586,10 +3277,7 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v4, s3
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-NEXT:    flat_store_dword v[3:4], v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index 00ff2d7a35d561..b3a7e65f771c43 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -20,14 +20,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -38,14 +35,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -105,14 +99,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0 offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -123,14 +114,11 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0 offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -303,9 +291,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_inc_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -321,9 +306,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_inc_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -383,9 +365,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_inc_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -403,9 +382,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_inc_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -468,9 +444,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
 ; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -488,9 +461,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
 ; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -553,9 +523,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_inc_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -568,9 +535,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_inc_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -621,9 +585,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_inc_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -638,9 +599,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_inc_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -694,9 +652,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_inc_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -711,9 +666,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_inc_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -768,9 +720,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -778,7 +728,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -793,9 +742,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -803,7 +750,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -868,9 +814,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -888,9 +831,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -956,11 +896,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v2 offset:8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    flat_store_dword v[0:1], v3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -977,11 +914,8 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v2 offset:8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1057,10 +991,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -1076,10 +1007,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1]
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -1147,10 +1075,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CI-NEXT:    s_endpgm
@@ -1166,10 +1091,7 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
@@ -1355,13 +1277,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; CI-LABEL: global_atomic_inc_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1374,13 +1293,10 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
 ; VI-LABEL: global_atomic_inc_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1440,15 +1356,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; CI-LABEL: global_atomic_inc_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1461,15 +1374,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
 ; VI-LABEL: global_atomic_inc_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1530,15 +1440,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; CI-LABEL: global_atomic_inc_ret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1551,15 +1458,12 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
 ; VI-LABEL: global_atomic_inc_ret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1620,13 +1524,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; CI-LABEL: global_atomic_inc_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1636,13 +1537,10 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
 ; VI-LABEL: global_atomic_inc_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1693,15 +1591,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; CI-LABEL: global_atomic_inc_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1711,15 +1606,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
 ; VI-LABEL: global_atomic_inc_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1771,15 +1663,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; CI-LABEL: global_atomic_inc_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0)
@@ -1789,15 +1678,12 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
 ; VI-LABEL: global_atomic_inc_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -1850,9 +1736,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1876,9 +1759,6 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1955,9 +1835,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1976,9 +1853,6 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2042,9 +1916,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_ret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -2060,9 +1931,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_ret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -2078,8 +1946,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_ret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2094,10 +1960,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2134,9 +1996,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_ret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -2154,9 +2013,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_ret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -2174,8 +2030,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2190,10 +2044,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2233,9 +2083,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 16
@@ -2253,9 +2100,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 16
@@ -2273,8 +2117,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2289,10 +2131,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2332,9 +2170,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2347,9 +2182,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2362,8 +2194,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2375,10 +2205,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2411,9 +2237,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i32_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -2428,9 +2251,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i32_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -2445,8 +2265,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2458,10 +2276,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2497,9 +2311,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    v_mov_b32_e32 v2, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 16
@@ -2514,9 +2325,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    v_mov_b32_e32 v2, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 16
@@ -2531,8 +2339,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2544,10 +2350,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 42
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2584,9 +2386,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2594,7 +2394,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 20, v0
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, 42
 ; CI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_wbinvl1_vol
@@ -2609,9 +2408,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2619,7 +2416,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 20, v0
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 42
 ; VI-NEXT:    flat_atomic_inc v3, v[0:1], v3 glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; VI-NEXT:    buffer_wbinvl1_vol
@@ -2634,8 +2430,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 42
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
@@ -2654,10 +2448,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 42
@@ -2714,9 +2504,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2734,9 +2521,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -2754,8 +2538,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s1
@@ -2769,10 +2551,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2826,10 +2604,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v4, s3
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, 2, v0
 ; CI-NEXT:    v_mov_b32_e32 v3, s2
 ; CI-NEXT:    flat_store_dword v[3:4], v0
@@ -2848,10 +2623,7 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v4, s3
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 2, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, s2
 ; VI-NEXT:    flat_store_dword v[3:4], v0
@@ -2924,13 +2696,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_ret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2948,13 +2717,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_ret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2972,9 +2738,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_ret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -2989,10 +2753,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3031,15 +2791,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_ret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3057,15 +2814,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_ret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3083,9 +2837,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -3100,10 +2852,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3145,15 +2893,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; CI-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s2, s2, 32
 ; CI-NEXT:    s_addc_u32 s3, s3, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3171,15 +2916,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; VI-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s2, s2, 32
 ; VI-NEXT:    s_addc_u32 s3, s3, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3197,9 +2939,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s2
@@ -3214,10 +2954,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3259,13 +2995,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3275,13 +3008,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3291,9 +3021,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3305,10 +3033,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3343,15 +3067,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; CI-LABEL: flat_atomic_inc_noret_i64_offset:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3361,15 +3082,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; VI-LABEL: flat_atomic_inc_noret_i64_offset:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3379,9 +3097,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3393,10 +3109,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3434,15 +3146,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; CI-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s0, s0, 32
 ; CI-NEXT:    s_addc_u32 s1, s1, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v1, 0
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; CI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3452,15 +3161,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; VI-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s0, s0, 32
 ; VI-NEXT:    s_addc_u32 s1, s1, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v1, 0
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_atomic_inc_x2 v[2:3], v[0:1]
 ; VI-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3470,9 +3176,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
@@ -3484,10 +3188,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -3526,9 +3226,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3555,9 +3252,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3584,14 +3278,12 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v5, 3, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v5
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3605,10 +3297,6 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
 ;
 ; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3667,9 +3355,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3688,9 +3373,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3709,14 +3391,12 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 42
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
 ; GFX9-NEXT:    flat_atomic_inc_x2 v[3:4], v[1:2] offset:40
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3725,10 +3405,6 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
 ;
 ; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3779,7 +3455,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, 42
 ; CI-NEXT:    s_mov_b32 m0, -1
-; CI-NEXT:    s_add_i32 s10, s10, s15
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
 ; CI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
@@ -3787,8 +3462,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; CI-NEXT:    ds_inc_rtn_u32 v3, v1, v0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
@@ -3802,7 +3475,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 42
 ; VI-NEXT:    s_mov_b32 m0, -1
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
 ; VI-NEXT:    ds_inc_rtn_u32 v2, v1, v0
@@ -3810,8 +3482,6 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
 ; VI-NEXT:    ds_inc_rtn_u32 v3, v1, v0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index 868b530e42a219..34efb089b72bf1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3037,7 +3037,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT:     user_sgpr_count = 12
+; GPRIDX-NEXT:     user_sgpr_count = 10
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -3052,7 +3052,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
 ; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 1
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -3069,7 +3069,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 15
+; GPRIDX-NEXT:     wavefront_sgpr_count = 13
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -3117,7 +3117,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 1
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -3128,7 +3128,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT:     user_sgpr_count = 12
+; MOVREL-NEXT:     user_sgpr_count = 10
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -3143,7 +3143,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
 ; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; MOVREL-NEXT:     enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 1
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -3160,7 +3160,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 22
+; MOVREL-NEXT:     wavefront_sgpr_count = 9
 ; MOVREL-NEXT:     workitem_vgpr_count = 4
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -3178,24 +3178,21 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; MOVREL-NEXT:    s_load_dword s8, s[6:7], 0x8
-; MOVREL-NEXT:    s_add_i32 s10, s10, s15
-; MOVREL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; MOVREL-NEXT:    s_mov_b32 s4, 0
 ; MOVREL-NEXT:    s_mov_b32 s5, 0x40080000
+; MOVREL-NEXT:    s_mov_b32 s2, 0
+; MOVREL-NEXT:    s_mov_b32 s3, 0x40140000
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 1
 ; MOVREL-NEXT:    s_cselect_b64 s[6:7], 2.0, 1.0
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 2
-; MOVREL-NEXT:    s_mov_b32 s2, 0
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], s[4:5], s[6:7]
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 3
-; MOVREL-NEXT:    s_mov_b32 s3, 0x40140000
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], 4.0, s[4:5]
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 4
 ; MOVREL-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s2
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
-; MOVREL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s3
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
 ; MOVREL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -3223,7 +3220,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT:     user_sgpr_count = 12
+; GFX10-NEXT:     user_sgpr_count = 10
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -3238,7 +3235,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
 ; GFX10-NEXT:     enable_sgpr_queue_ptr = 0
 ; GFX10-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GFX10-NEXT:     enable_sgpr_dispatch_id = 1
-; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 1
+; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_size = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4057,7 +4054,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT:     user_sgpr_count = 12
+; GPRIDX-NEXT:     user_sgpr_count = 10
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4072,7 +4069,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
 ; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 1
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4089,7 +4086,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 14
+; GPRIDX-NEXT:     wavefront_sgpr_count = 12
 ; GPRIDX-NEXT:     workitem_vgpr_count = 2
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4130,7 +4127,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 0
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -4141,7 +4138,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT:     user_sgpr_count = 12
+; MOVREL-NEXT:     user_sgpr_count = 10
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4156,7 +4153,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
 ; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; MOVREL-NEXT:     enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 1
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4173,7 +4170,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 22
+; MOVREL-NEXT:     wavefront_sgpr_count = 8
 ; MOVREL-NEXT:     workitem_vgpr_count = 3
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -4191,9 +4188,6 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; MOVREL-NEXT:    s_add_i32 s10, s10, s15
-; MOVREL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; MOVREL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s2, 1
 ; MOVREL-NEXT:    s_cselect_b32 s3, 2.0, 1.0
@@ -4229,7 +4223,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT:     user_sgpr_count = 12
+; GFX10-NEXT:     user_sgpr_count = 10
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4244,7 +4238,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_sgpr_queue_ptr = 0
 ; GFX10-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GFX10-NEXT:     enable_sgpr_dispatch_id = 1
-; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 1
+; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_size = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4407,7 +4401,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_mem_ordered = 0
 ; GPRIDX-NEXT:     enable_fwd_progress = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT:     user_sgpr_count = 12
+; GPRIDX-NEXT:     user_sgpr_count = 10
 ; GPRIDX-NEXT:     enable_trap_handler = 0
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GPRIDX-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4422,7 +4416,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     enable_sgpr_queue_ptr = 0
 ; GPRIDX-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GPRIDX-NEXT:     enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 1
+; GPRIDX-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; GPRIDX-NEXT:     enable_sgpr_private_segment_size = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GPRIDX-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4439,7 +4433,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GPRIDX-NEXT:     gds_segment_byte_size = 0
 ; GPRIDX-NEXT:     kernarg_segment_byte_size = 28
 ; GPRIDX-NEXT:     workgroup_fbarrier_count = 0
-; GPRIDX-NEXT:     wavefront_sgpr_count = 15
+; GPRIDX-NEXT:     wavefront_sgpr_count = 13
 ; GPRIDX-NEXT:     workitem_vgpr_count = 3
 ; GPRIDX-NEXT:     reserved_vgpr_first = 0
 ; GPRIDX-NEXT:     reserved_vgpr_count = 0
@@ -4483,7 +4477,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     kernel_code_entry_byte_offset = 256
 ; MOVREL-NEXT:     kernel_code_prefetch_byte_size = 0
 ; MOVREL-NEXT:     granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 2
+; MOVREL-NEXT:     granulated_wavefront_sgpr_count = 1
 ; MOVREL-NEXT:     priority = 0
 ; MOVREL-NEXT:     float_mode = 240
 ; MOVREL-NEXT:     priv = 0
@@ -4494,7 +4488,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_mem_ordered = 0
 ; MOVREL-NEXT:     enable_fwd_progress = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT:     user_sgpr_count = 12
+; MOVREL-NEXT:     user_sgpr_count = 10
 ; MOVREL-NEXT:     enable_trap_handler = 0
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; MOVREL-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4509,7 +4503,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     enable_sgpr_queue_ptr = 0
 ; MOVREL-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; MOVREL-NEXT:     enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 1
+; MOVREL-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; MOVREL-NEXT:     enable_sgpr_private_segment_size = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; MOVREL-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
@@ -4526,7 +4520,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:     gds_segment_byte_size = 0
 ; MOVREL-NEXT:     kernarg_segment_byte_size = 28
 ; MOVREL-NEXT:     workgroup_fbarrier_count = 0
-; MOVREL-NEXT:     wavefront_sgpr_count = 22
+; MOVREL-NEXT:     wavefront_sgpr_count = 9
 ; MOVREL-NEXT:     workitem_vgpr_count = 4
 ; MOVREL-NEXT:     reserved_vgpr_first = 0
 ; MOVREL-NEXT:     reserved_vgpr_count = 0
@@ -4544,12 +4538,10 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:  ; %bb.0: ; %entry
 ; MOVREL-NEXT:    s_load_dword s8, s[6:7], 0x8
 ; MOVREL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; MOVREL-NEXT:    s_add_i32 s10, s10, s15
-; MOVREL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; MOVREL-NEXT:    s_mov_b32 s2, 0
+; MOVREL-NEXT:    s_mov_b32 s3, 0x40080000
 ; MOVREL-NEXT:    s_waitcnt lgkmcnt(0)
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 1
-; MOVREL-NEXT:    s_mov_b32 s3, 0x40080000
 ; MOVREL-NEXT:    s_cselect_b64 s[4:5], 2.0, 1.0
 ; MOVREL-NEXT:    s_cmp_eq_u32 s8, 2
 ; MOVREL-NEXT:    s_cselect_b64 s[2:3], s[2:3], s[4:5]
@@ -4557,7 +4549,6 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; MOVREL-NEXT:    s_cselect_b64 s[2:3], 4.0, s[2:3]
 ; MOVREL-NEXT:    v_mov_b32_e32 v0, s2
 ; MOVREL-NEXT:    v_mov_b32_e32 v3, s1
-; MOVREL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; MOVREL-NEXT:    v_mov_b32_e32 v1, s3
 ; MOVREL-NEXT:    v_mov_b32_e32 v2, s0
 ; MOVREL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -4585,7 +4576,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_mem_ordered = 1
 ; GFX10-NEXT:     enable_fwd_progress = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT:     user_sgpr_count = 12
+; GFX10-NEXT:     user_sgpr_count = 10
 ; GFX10-NEXT:     enable_trap_handler = 0
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_x = 1
 ; GFX10-NEXT:     enable_sgpr_workgroup_id_y = 1
@@ -4600,7 +4591,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
 ; GFX10-NEXT:     enable_sgpr_queue_ptr = 0
 ; GFX10-NEXT:     enable_sgpr_kernarg_segment_ptr = 1
 ; GFX10-NEXT:     enable_sgpr_dispatch_id = 1
-; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 1
+; GFX10-NEXT:     enable_sgpr_flat_scratch_init = 0
 ; GFX10-NEXT:     enable_sgpr_private_segment_size = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_x = 0
 ; GFX10-NEXT:     enable_sgpr_grid_workgroup_count_y = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 70b889389ff998..9443b39dcdc033 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -12,9 +12,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_mov_b32 s4, s0
 ; GFX8V4-NEXT:    s_mov_b32 s5, s3
@@ -25,7 +23,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], s[6:7], 0
-; GFX8V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8V4-NEXT:    s_waitcnt vmcnt(0)
@@ -40,9 +37,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_mov_b32 s4, s0
 ; GFX8V5-NEXT:    s_mov_b32 s5, s2
@@ -52,7 +47,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], s[2:3], 0
-; GFX8V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8V5-NEXT:    s_waitcnt vmcnt(0)
@@ -66,10 +60,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_mov_b32 s2, s0
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
@@ -78,7 +71,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V4-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
-; GFX9V4-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V4-NEXT:    s_waitcnt vmcnt(0)
@@ -92,10 +84,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
 ; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_mov_b32 s2, s0
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
@@ -104,7 +95,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s1, -1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9V5-NEXT:    s_cselect_b64 s[0:1], s[4:5], 0
-; GFX9V5-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v2
 ; GFX9V5-NEXT:    s_waitcnt vmcnt(0)
@@ -127,9 +117,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x40
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
@@ -143,9 +130,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xcc
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
@@ -189,9 +173,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x44
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b32 s0, 1, 0
@@ -205,9 +186,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xc8
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b32 s0, 1, 0
@@ -291,10 +269,7 @@ define amdgpu_kernel void @llvm_debugtrap() {
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8V4-NEXT:    s_add_u32 s0, s8, 8
 ; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
@@ -320,9 +295,6 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index 7aa3b5bb109903..4fcde0f2fc7cf1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[20:23], s[6:7], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[24:25], s[6:7], 0x10
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s13
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    v_mov_b32_e32 v64, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
index 6ed2df430998f8..c3938e673a6da6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -11,16 +11,13 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace(
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 4
 ; CHECK-NEXT:    s_mov_b32 m0, -1
-; CHECK-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-NEXT:    ds_read_b32 v2, v0
-; CHECK-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; CHECK-NEXT:    v_mov_b32_e32 v3, 9
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_add_u32 s0, s0, 4
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-NEXT:    v_mov_b32_e32 v3, 9
 ; CHECK-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x200
 ; CHECK-NEXT:    ds_write_b32 v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
index 3a90c3ee908033..d7a82b415ff06c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
@@ -6,9 +6,6 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) {
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0xa
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_cmp_eq_u32 s2, 0
 ; GCN-NEXT:    s_cselect_b32 s2, 1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
index 96fd14f52d13b0..f3654fea486e0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
@@ -42,9 +42,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -62,9 +59,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -82,8 +76,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -93,10 +85,6 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
 ;
 ; GFX10-LABEL: s_trig_preop_f64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_clause 0x1
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    s_load_dword s2, s[6:7], 0x8
@@ -125,9 +113,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ; CI-LABEL: s_trig_preop_f64_imm:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
 ; CI-NEXT:    s_add_u32 s0, s0, 4
@@ -143,9 +128,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ; VI-LABEL: s_trig_preop_f64_imm:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
 ; VI-NEXT:    s_add_u32 s0, s0, 4
@@ -161,8 +143,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ; GFX9-LABEL: s_trig_preop_f64_imm:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
 ; GFX9-NEXT:    flat_store_dwordx2 v[0:1], v[0:1]
@@ -171,10 +151,6 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
 ;
 ; GFX10-LABEL: s_trig_preop_f64_imm:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_trig_preop_f64 v[0:1], s[0:1], 7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index f0ec0d101f5bec..7d7f450e590faa 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -7,9 +7,6 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: sdivrem_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s8, s5, 31
 ; GFX8-NEXT:    s_add_i32 s0, s5, s8
@@ -148,9 +145,6 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
 ; GFX8-LABEL: sdivrem_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s2, s9, 31
@@ -622,9 +616,6 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: sdivrem_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_ashr_i32 s2, s10, 31
@@ -854,9 +845,6 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
 ; GFX8-LABEL: sdivrem_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1283,9 +1271,6 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: sdivrem_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x0
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x20
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2203,9 +2188,6 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
 ; GFX8-LABEL: sdiv_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x80008
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
@@ -2351,9 +2333,6 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: sdivrem_v2i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s2, 0x80010
 ; GFX8-NEXT:    s_ashr_i32 s3, s0, 31
@@ -2618,9 +2597,6 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
 ; GFX8-LABEL: sdiv_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x100010
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
@@ -2766,9 +2742,6 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-LABEL: sdivrem_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s3
 ; GFX8-NEXT:    s_ashr_i32 s8, s0, 31
@@ -3030,9 +3003,6 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
 ; GFX8-LABEL: sdivrem_i3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s4, 0x30008
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
@@ -3184,9 +3154,6 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: sdivrem_i27:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_i32 s0, s5, 0x1b0000
 ; GFX8-NEXT:    s_ashr_i32 s5, s0, 31
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index 2be04ace99e36d..5aef6679347094 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -7,9 +7,6 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: udivrem_i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
 ; GFX8-NEXT:    s_sub_i32 s0, 0, s5
@@ -115,9 +112,6 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, i64 %x, i64 %y) {
 ; GFX8-LABEL: udivrem_i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s11
@@ -528,9 +522,6 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i32> %x, <2 x i32> %y) {
 ; GFX8-LABEL: udivrem_v2i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s10
@@ -694,9 +685,6 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
 ; GFX8-LABEL: udivrem_v4i32:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -991,10 +979,7 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
 define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
 ; GFX8-LABEL: udivrem_v2i64:
 ; GFX8:       ; %bb.0:
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
 ; GFX8-NEXT:    s_load_dwordx4 s[12:15], s[6:7], 0x20
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8-NEXT:    s_load_dwordx8 s[4:11], s[6:7], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s13
@@ -1787,9 +1772,6 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
 ; GFX8-LABEL: udiv_i8:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x80008
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
@@ -1903,9 +1885,6 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s0, s[6:7], 0x10
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s2, s0, 0x80010
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
@@ -2102,9 +2081,6 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
 ; GFX8-LABEL: udiv_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_lshr_b32 s5, s4, 16
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
@@ -2217,10 +2193,7 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-LABEL: udivrem_v2i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[6:7], 0x0
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s2, s1, 0xffff
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s2
@@ -2414,9 +2387,6 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
 ; GFX8-LABEL: udivrem_i3:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_bfe_u32 s5, s4, 0x30008
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
@@ -2535,9 +2505,6 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-LABEL: udivrem_i27:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_and_b32 s5, s5, 0x7ffffff
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s5
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index b8ffa4f14c3e5c..e53653408feb40 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -135,9 +135,6 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr)
 ; FIXEDABI-LABEL: marked_kernel_use_workitem_id:
 ; FIXEDABI:       ; %bb.0:
 ; FIXEDABI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; FIXEDABI-NEXT:    s_add_i32 s6, s6, s11
-; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v4, s1
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v3, s0
@@ -184,19 +181,16 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr)
 ; FIXEDABI-LABEL: marked_kernel_use_workgroup_id:
 ; FIXEDABI:       ; %bb.0:
 ; FIXEDABI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
-; FIXEDABI-NEXT:    s_add_i32 s6, s6, s11
-; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s8
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s6
 ; FIXEDABI-NEXT:    s_waitcnt lgkmcnt(0)
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v0, s0
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v1, s1
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s9
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s7
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s10
+; FIXEDABI-NEXT:    v_mov_b32_e32 v2, s8
 ; FIXEDABI-NEXT:    flat_store_dword v[0:1], v2
 ; FIXEDABI-NEXT:    s_waitcnt vmcnt(0)
 ; FIXEDABI-NEXT:    s_endpgm
@@ -244,9 +238,6 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
 ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_add_i32 s6, s6, s11
-; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s7
-; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s6, 8
 ; FIXEDABI-NEXT:    s_add_u32 s0, s4, 8
 ; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; FIXEDABI-NEXT:    s_addc_u32 s1, s5, 0
@@ -270,10 +261,7 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
 define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 {
 ; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr:
 ; FIXEDABI:       ; %bb.0:
-; FIXEDABI-NEXT:    s_add_i32 s4, s4, s9
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v0, 0
-; FIXEDABI-NEXT:    s_mov_b32 flat_scratch_lo, s5
-; FIXEDABI-NEXT:    s_lshr_b32 flat_scratch_hi, s4, 8
 ; FIXEDABI-NEXT:    v_mov_b32_e32 v1, 0
 ; FIXEDABI-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; FIXEDABI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
index 42494efe27a824..4b1484e9bd958e 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -44,7 +44,7 @@ define void @use_group_to_flat_addrspacecast_func(ptr addrspace(3) %ptr) #0 {
 ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
 ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
 ; SI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
+; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
 ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 ; HSA: buffer_store_dword [[K]], [[PTR]], s[[[BASELO]]:[[RSRCHI]]], 0 offen
@@ -226,8 +226,8 @@ define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
 ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
 ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
 ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
 ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: buffer_store_dword v[[K]], off, s[[[BASELO]]:[[RSRCHI]]], 0
@@ -251,8 +251,8 @@ define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
 
 ; HSA-DAG: s_mov_b64 s[{{[0-9]+}}:[[RSRCHI:[0-9]+]]], s[2:3]
 ; HSA-DAG: s_mov_b64 s[[[BASELO:[0-9]+]]:[[BASEHI:[0-9]+]]], s[0:1]
-; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s9
-; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+; CI-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s7
+; GFX9-DAG: s_add_u32 s[[BASELO]], s[[BASELO]], s5
 ; HSA-DAG: s_addc_u32 s[[BASEHI]], s[[BASEHI]], 0
 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
index e0c69706bad79c..0a461f9ee6c968 100644
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -8,10 +8,8 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
 ; GCN-LABEL: readfirstlane_uniform:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
 ; GCN-NEXT:    v_readfirstlane_b32 s4, v0
 ; GCN-NEXT:    s_mov_b32 s5, 0
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_lshl_b64 s[4:5], s[4:5], 2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s0, s0, s4
@@ -20,7 +18,6 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
 ; GCN-NEXT:    s_add_u32 s0, s2, 40
 ; GCN-NEXT:    s_addc_u32 s1, s3, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 89c5303e0e81d1..bf72cccd912cee 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -393,14 +393,11 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
 ; GCN-LABEL: select_add_lhs_const_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x83
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x80
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_bitcmp1_b32 s0, 0
 ; GCN-NEXT:    s_cselect_b64 vcc, -1, 0
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    flat_store_short v[0:1], v0
 ; GCN-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index 54a800ecee9f17..a6d8c6f41eee59 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE
 
 ; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
-; TRAP-HANDLER-ENABLE:  NumSgprs: 83
-; TRAP-HANDLER-DISABLE: NumSgprs: 98
+; TRAP-HANDLER-ENABLE:  NumSgprs: 77
+; TRAP-HANDLER-DISABLE: NumSgprs: 92
 define amdgpu_kernel void @amdhsa_trap_num_sgprs(
     ptr addrspace(1) %out0, i32 %in0,
     ptr addrspace(1) %out1, i32 %in1,
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index 22cc5af30da662..fc13b86566f76c 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -35,9 +35,9 @@ entry:
 attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
 
 ; CHECK-LABEL: {{^}}min_1024_max_1024
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 0
 ; CHECK: VGPRBlocks: 10
-; CHECK: NumSGPRsForWavesPerEU: 24{{$}}
+; CHECK: NumSGPRsForWavesPerEU: 2{{$}}
 ; CHECK: NumVGPRsForWavesPerEU: 43
 @var = addrspace(1) global float 0.0
 define amdgpu_kernel void @min_1024_max_1024() #3 {
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index 3ddf8be052e4ab..ed045107d354dc 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -4,8 +4,8 @@
 
 ; ALL-LABEL: {{^}}max_10_sgprs:
 
-; ALL: SGPRBlocks: 2
-; ALL: NumSGPRsForWavesPerEU: 22
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 10
 define amdgpu_kernel void @max_10_sgprs() #0 {
   %one = load volatile i32, ptr addrspace(4) undef
   %two = load volatile i32, ptr addrspace(4) undef
@@ -125,7 +125,7 @@ declare i64 @llvm.amdgcn.dispatch.id() #1
 declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #1
 declare ptr addrspace(4) @llvm.amdgcn.queue.ptr() #1
 
-attributes #0 = { nounwind "amdgpu-num-sgpr"="18" }
+attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
 attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 26dc3e13a72cf0..14519f5a5e77c0 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"}
 
 ; Exactly 10 waves per execution unit.
 ; CHECK-LABEL: {{^}}exactly_10:
-; CHECK: SGPRBlocks: 3
+; CHECK: SGPRBlocks: 2
 ; CHECK: VGPRBlocks: 5
-; CHECK: NumSGPRsForWavesPerEU: 26
+; CHECK: NumSGPRsForWavesPerEU: 20
 ; CHECK: NumVGPRsForWavesPerEU: 24
 define amdgpu_kernel void @exactly_10() #9 {
   %val0 = load volatile float, ptr addrspace(1) @var
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
index f9f48aacfc2bf7..b7503f26b1ab6d 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -503,38 +503,31 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ;
 ; GFX10: name:            with_alloca_cc_vs
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
 ;
 ; GFX10: name:            with_alloca_cc_gs
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
 ; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
 ;
 ; GFX10: name:            with_alloca_cc_ps
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
 ;
 ; GFX10: name:            with_alloca_cc_cs
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
 ;
 ; GFX10: name:            with_alloca_cc_hs
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
 ; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
 ;
 ; GFX10: name:            with_alloca_cc_ls
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
 ;
 ; GFX10: name:            with_alloca_cc_es
 ; GFX10:       argumentInfo:
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr0_sgpr1' }
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr2' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
 ;
 ; GFX10: name:            with_alloca_cc_gfx
 ; GFX10:       argumentInfo:
@@ -701,8 +694,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
+; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr7' }
 ;
 ; GFX10: name:            without_region_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -730,8 +723,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            without_group_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -759,8 +751,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            without_constant_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -788,8 +779,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            without_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -817,8 +807,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            call_without_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -846,8 +835,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            call_both_with_and_without_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -861,8 +849,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            call_call_without_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -890,8 +877,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            call_call_both_with_and_without_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -905,8 +891,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            with_cast_call_without_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -920,8 +905,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            with_cast_call_with_private_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
@@ -935,8 +919,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            call_without_alloca_and_without_addrspacecast
 ; GFX10:       argumentInfo:
@@ -964,8 +947,7 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr8' }
+; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
 ; GFX10: name:            with_indirect_call
 ; GFX10:       argumentInfo:
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index 470c444ad8cd74..90562e25a3e9c1 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -23,7 +23,7 @@
 ; COV5: .amdhsa_user_sgpr_queue_ptr 0
 ; NOOPT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; NOOPT: .amdhsa_user_sgpr_dispatch_id 1
-; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 1
+; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0
 ; NOOPT: .amdhsa_user_sgpr_private_segment_size 0
 ; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
 ; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 1511988ad61c19..032ec65fa85133 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -192,11 +192,11 @@ define hidden void @use_workgroup_id_yz() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x:
 ; GCN-NOT: s6
-; GCN: s_mov_b32 s12, s4
+; GCN: s_mov_b32 s12, s6
 ; GCN: s_mov_b32 s32, 0
-; GCN: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+12
+; GCN: s_getpc_b64 s[4:5]
+; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x@rel32@hi+12
 ; GCN: s_swappc_b64
 ; GCN-NEXT: s_endpgm
 
@@ -210,7 +210,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y:
 ; GCN-NOT: s12
-; GCN: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s13, s7
 ; GCN-NOT: s12
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -226,7 +226,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z:
 ; GCN-NOT: s12
 ; GCN-NOT: s13
-; GCN: s_mov_b32 s14, s5
+; GCN: s_mov_b32 s14, s7
 ; GCN-NOT: s12
 ; GCN-NOT: s13
 
@@ -243,8 +243,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy:
 ; GCN-NOT: s14
-; GCN: s_mov_b32 s12, s4
-; GCN-NEXT: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s12, s6
+; GCN-NEXT: s_mov_b32 s13, s7
 ; GCN-NOT: s14
 
 ; GCN: s_mov_b32 s32, 0
@@ -259,9 +259,9 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
 }
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz:
-; GCN: s_mov_b32 s12, s4
-; GCN: s_mov_b32 s13, s5
-; GCN: s_mov_b32 s14, s6
+; GCN: s_mov_b32 s12, s6
+; GCN: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s14, s8
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
 
@@ -276,8 +276,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 {
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz:
 
 ; GCN-NOT: s13
-; GCN: s_mov_b32 s12, s4
-; GCN-NEXT: s_mov_b32 s14, s5
+; GCN: s_mov_b32 s12, s6
+; GCN-NEXT: s_mov_b32 s14, s7
 ; GCN-NOT: s13
 
 ; GCN: s_mov_b32 s32, 0
@@ -293,8 +293,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz:
 
-; GCN: s_mov_b32 s13, s5
-; GCN: s_mov_b32 s14, s6
+; GCN: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s14, s8
 
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -375,7 +375,7 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
 
 ; GCN-NOT: s13
 ; GCN-NOT: s14
-; GCN-DAG: s_mov_b32 s12, s4
+; GCN-DAG: s_mov_b32 s12, s6
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
 ; GCN-NOT: s13
 ; GCN-NOT: s14
@@ -393,7 +393,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y:
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s13, s5
+; GCN-DAG: s_mov_b32 s13, s7
 
 ; GCN-DAG: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
@@ -408,7 +408,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 {
 
 ; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z:
 ; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s14, s5
+; GCN-DAG: s_mov_b32 s14, s7
 
 ; GCN: s_mov_b32 s32, 0
 ; GCN: s_swappc_b64
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index 48ad2fe687804c..3035a8579c8a6d 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -12,13 +12,13 @@
 ; OSABI-AMDHSA-ASM: .section .rodata,"a"
 ; OSABI-AMDHSA-ASM: .p2align 6
 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd
-; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 12
+; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 10
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 8
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 1
+; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
 ; OSABI-AMDHSA-ASM: .text
 
@@ -31,13 +31,13 @@
 ; OSABI-AMDHSA-ASM: .section .rodata,"a"
 ; OSABI-AMDHSA-ASM: .p2align 6
 ; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub
-; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 12
+; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_count 10
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_user_sgpr_kernarg_segment_ptr 1
 ; OSABI-AMDHSA-ASM:     .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 16
+; OSABI-AMDHSA-ASM:     .amdhsa_next_free_sgpr 8
 ; OSABI-AMDHSA-ASM:     .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 1
+; OSABI-AMDHSA-ASM:     .amdhsa_reserve_flat_scratch 0
 ; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
 ; OSABI-AMDHSA-ASM: .text
 
diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index c97d3338006022..5fbcd0bf669995 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -5,9 +5,6 @@
 define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
 ; CHECK-LABEL: _Z11test_kernelPii:
 ; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x2
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 3
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index f586f6d16e0efd..297fe7618672e6 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -6,8 +6,6 @@
 define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) {
 ; CHECK-LABEL: eggs:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x8
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
index 39328d706ba260..85ed2914b8c7f5 100644
--- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-NEXT:    s_load_dword s14, s[6:7], 0x4
-; CHECK-NEXT:    s_add_u32 s24, s24, s15
+; CHECK-NEXT:    s_add_u32 s24, s24, s13
 ; CHECK-NEXT:    s_addc_u32 s25, s25, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_bitcmp1_b32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index 05a245cd3443c4..54fb1dc5c05274 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -94,9 +94,6 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
 ; GFX7-LABEL: s_add_co_br_user:
 ; GFX7:       ; %bb.0: ; %bb
 ; GFX7-NEXT:    s_load_dword s2, s[6:7], 0x0
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_add_i32 s0, s2, s2
 ; GFX7-NEXT:    s_cmp_lt_u32 s0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 7fde702bd49ac8..21799ab79b8396 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -13,9 +13,6 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -28,9 +25,6 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -74,9 +68,6 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -89,9 +80,6 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -134,9 +122,6 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -149,9 +134,6 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -193,9 +175,6 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; CI-LABEL: s_fabs_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
@@ -209,9 +188,6 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
 ; VI-LABEL: s_fabs_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s3, s3, 0x7fff7fff
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
@@ -256,9 +232,6 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
 ; CI-LABEL: fabs_fold_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -276,9 +249,6 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
@@ -325,9 +295,6 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -342,9 +309,6 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -394,9 +358,6 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -409,9 +370,6 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0x7fff7fff
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -457,9 +415,6 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -486,9 +441,6 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -547,12 +499,9 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v0, v[0:1]
 ; CI-NEXT:    s_lshr_b32 s2, s4, 16
@@ -578,12 +527,9 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v2, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -643,9 +589,6 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -669,9 +612,6 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -742,9 +682,6 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -763,9 +700,6 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 4df6b8d0669158..7252c69cb1cf75 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -74,9 +74,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align2:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
-; GFX7-ALIGNED-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-ALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-ALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 1
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -93,9 +90,6 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
-; GFX7-UNALIGNED-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-UNALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-UNALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -226,10 +220,8 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
-; GFX7-ALIGNED-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-ALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-ALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 1
+; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s2, s0, 2
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -239,7 +231,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v2
 ; GFX7-ALIGNED-NEXT:    s_addc_u32 s5, s1, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s4
-; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX7-ALIGNED-NEXT:    s_add_u32 s0, s0, 3
 ; GFX7-ALIGNED-NEXT:    flat_store_byte v[0:1], v3
@@ -256,9 +247,6 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
-; GFX7-UNALIGNED-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-UNALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-UNALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -371,9 +359,6 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
 ; GFX7-ALIGNED-LABEL: global_store_2xi16_align4:
 ; GFX7-ALIGNED:       ; %bb.0:
 ; GFX7-ALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
-; GFX7-ALIGNED-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-ALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-ALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-ALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-ALIGNED-NEXT:    v_mov_b32_e32 v0, s0
@@ -384,9 +369,6 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
 ; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4:
 ; GFX7-UNALIGNED:       ; %bb.0:
 ; GFX7-UNALIGNED-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
-; GFX7-UNALIGNED-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-UNALIGNED-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-UNALIGNED-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v2, 0x20001
 ; GFX7-UNALIGNED-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-UNALIGNED-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 6482749bd2fb7e..f0ce96af90649d 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -24,9 +24,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
 ; GFX678-LABEL: v_test_canonicalize_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -83,9 +80,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
@@ -97,9 +91,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -149,9 +140,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fabs_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -208,9 +196,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -268,9 +253,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fneg_var_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -327,9 +309,6 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
 ; GFX678-LABEL: test_fold_canonicalize_undef_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -373,9 +352,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -419,9 +395,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -468,9 +441,6 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 1.0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -515,9 +485,6 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, -1.0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -562,9 +529,6 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
 ; GFX678-LABEL: test_fold_canonicalize_literal_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x41800000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -609,9 +573,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -655,13 +616,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
-; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX678-NEXT:    flat_store_dword v[0:1], v2
 ; GFX678-NEXT:    s_endpgm
@@ -706,13 +664,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
-; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX678-NEXT:    flat_store_dword v[0:1], v2
 ; GFX678-NEXT:    s_endpgm
@@ -757,13 +712,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
-; GFX678-NEXT:    v_mul_f32_e64 v2, 1.0, s2
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX678-NEXT:    flat_store_dword v[0:1], v2
 ; GFX678-NEXT:    s_endpgm
@@ -808,9 +760,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -855,9 +804,6 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_bfrev_b32_e32 v2, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -904,9 +850,6 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x807fffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -951,9 +894,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out
 ; GFX678-LABEL: test_fold_canonicalize_qnan_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -998,9 +938,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1045,9 +982,6 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1092,9 +1026,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1139,9 +1070,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1186,9 +1114,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1233,9 +1158,6 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
@@ -1280,9 +1202,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1
 ; GFX678-LABEL: v_test_canonicalize_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1338,9 +1257,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
 ; GFX6-LABEL: s_test_canonicalize_var_f64:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_max_f64 v[2:3], s[2:3], s[2:3]
 ; GFX6-NEXT:    v_mov_b32_e32 v0, s0
@@ -1351,9 +1267,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
 ; GFX8-LABEL: s_test_canonicalize_var_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_max_f64 v[0:1], s[2:3], s[2:3]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
@@ -1400,9 +1313,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fabs_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1459,9 +1369,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1
 ; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1519,9 +1426,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou
 ; GFX678-LABEL: v_test_canonicalize_fneg_var_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX678-NEXT:    v_mov_b32_e32 v1, s1
@@ -1578,13 +1482,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX678-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1630,13 +1531,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1680,13 +1578,10 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_p1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1728,13 +1623,10 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
 ; GFX678-LABEL: test_fold_canonicalize_n1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0xbff00000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0xbff00000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1776,13 +1668,10 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
 ; GFX678-LABEL: test_fold_canonicalize_literal_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x40300000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x40300000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1824,13 +1713,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX678-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1876,13 +1762,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, -1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0xfffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0xfffff
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1927,13 +1810,10 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr
 ; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_bfrev_b32_e32 v1, 1
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -1977,13 +1857,10 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
 ; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, -1
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x800fffff
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x800fffff
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2028,13 +1905,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
 ; GFX678-LABEL: test_fold_canonicalize_qnan_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2076,13 +1950,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2124,13 +1995,10 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
 ; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2172,13 +2040,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2220,13 +2085,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2268,13 +2130,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2316,13 +2175,10 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
 ; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64:
 ; GFX678:       ; %bb.0:
 ; GFX678-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX678-NEXT:    s_add_i32 s10, s10, s15
-; GFX678-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX678-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX678-NEXT:    v_mov_b32_e32 v0, 0
+; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX678-NEXT:    v_mov_b32_e32 v3, s1
-; GFX678-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
 ; GFX678-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX678-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX678-NEXT:    s_endpgm
@@ -2365,9 +2221,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2385,9 +2238,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2455,9 +2305,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2475,9 +2322,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2545,9 +2389,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2566,9 +2407,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2637,9 +2475,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2663,9 +2498,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2736,9 +2568,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2756,9 +2585,6 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2826,9 +2652,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2846,9 +2669,6 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -2917,9 +2737,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -2938,9 +2755,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -3010,9 +2824,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -3036,9 +2847,6 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -3108,9 +2916,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX6-NEXT:    s_add_i32 s10, s10, s15
-; GFX6-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX6-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -3128,9 +2933,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index ab00b132f2fd1a..fee6540f43c64c 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -6,13 +6,18 @@
 ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tonga | FileCheck -check-prefix=NOHSA-NOADDR64 -check-prefix=ALL %s
 
 
+; There are no stack objects even though flat is used by default, so
+; flat_scratch_init should be disabled.
+
 ; ALL-LABEL: {{^}}test:
 
+; ALL-NOT: flat_scr
+
 ; HSA-DEFAULT: flat_store_dword
 ; HSA-NODEFAULT: buffer_store_dword
 ; HSA-NOADDR64: flat_store_dword
 
-; HSA: .amdhsa_user_sgpr_flat_scratch_init 1
+; HSA: .amdhsa_user_sgpr_flat_scratch_init 0
 
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index 134b7902380862..e4ffedd686ac93 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefix=CI -check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=FIJI-NOXNACK -check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s
@@ -8,16 +8,16 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX9-ARCH-FLAT-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX9-ARCH-FLAT-XNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s
 
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX10-ARCH-FLAT-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=GFX10-ARCH-FLAT-XNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s
 
 ; GCN-LABEL: {{^}}no_vcc_no_flat:
 
@@ -28,12 +28,8 @@
 ; CI: ; NumSgprs: 8
 ; VI-NOXNACK: ; NumSgprs: 8
 ; VI-XNACK: ; NumSgprs: 12
-; HSA-VI-NOXNACK: ; NumSgprs: 8
-; HSA-VI-XNACK: ; NumSgprs: 12
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 8
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 8
+; GFX9-ARCH-FLAT: ; NumSgprs: 14
+; GFX10-ARCH-FLAT: ; NumSgprs: 8
 define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7}"()
@@ -49,12 +45,8 @@ entry:
 ; CI: ; NumSgprs: 10
 ; VI-NOXNACK: ; NumSgprs: 10
 ; VI-XNACK: ; NumSgprs: 12
-; HSA-VI-NOXNACK: ; NumSgprs: 10
-; HSA-VI-XNACK: ; NumSgprs: 12
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 10
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 10
+; GFX9-ARCH-FLAT: ; NumSgprs: 14
+; GFX10-ARCH-FLAT: ; NumSgprs: 10
 define amdgpu_kernel void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -68,15 +60,10 @@ entry:
 ; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
 
 ; CI: ; NumSgprs: 12
-; FIJI-NOXNACK: ; NumSgprs: 14
 ; VI-NOXNACK: ; NumSgprs: 14
 ; VI-XNACK: ; NumSgprs: 14
-; HSA-VI-NOXNACK: ; NumSgprs: 24
-; HSA-VI-XNACK: ; NumSgprs: 24
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 8
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 8
+; GFX9-ARCH-FLAT: ; NumSgprs: 14
+; GFX10-ARCH-FLAT: ; NumSgprs: 8
 define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -92,10 +79,8 @@ entry:
 ; CI: ; NumSgprs: 12
 ; VI-NOXNACK: ; NumSgprs: 14
 ; VI-XNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 14
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 14
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 10
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 10
+; GFX9-ARCH-FLAT: ; NumSgprs: 14
+; GFX10-ARCH-FLAT: ; NumSgprs: 10
 define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -114,10 +99,8 @@ entry:
 ; CI: NumSgprs: 4
 ; VI-NOXNACK: NumSgprs: 6
 ; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0
+; GFX9-ARCH-FLAT: ; NumSgprs: 6
+; GFX10-ARCH-FLAT: ; NumSgprs: 0
 define amdgpu_kernel void @use_flat_scr() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -133,10 +116,8 @@ entry:
 ; CI: NumSgprs: 4
 ; VI-NOXNACK: NumSgprs: 6
 ; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0
+; GFX9-ARCH-FLAT: ; NumSgprs: 6
+; GFX10-ARCH-FLAT: ; NumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_lo() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -152,10 +133,8 @@ entry:
 ; CI: NumSgprs: 4
 ; VI-NOXNACK: NumSgprs: 6
 ; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT-NOXNACK: ; NumSgprs: 6
-; GFX9-ARCH-FLAT-XNACK: ; NumSgprs: 6
-; GFX10-ARCH-FLAT-NOXNACK: ; NumSgprs: 0
-; GFX10-ARCH-FLAT-XNACK: ; NumSgprs: 0
+; GFX9-ARCH-FLAT: ; NumSgprs: 6
+; GFX10-ARCH-FLAT: ; NumSgprs: 0
 define amdgpu_kernel void @use_flat_scr_hi() #0 {
 entry:
   call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 23b453438f3a0f..c60b9858abd836 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -16,9 +16,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
 ; VI-LABEL: multiple_fadd_use_test_f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_add_f32_e64 v0, s3, -1.0
 ; VI-NEXT:    v_add_f32_e64 v1, s2, -1.0
@@ -85,11 +82,8 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x2c
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_add_u32 s2, s0, 4
 ; VI-NEXT:    v_add_f32_e64 v2, s4, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -149,9 +143,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
 ; VI-LABEL: multiple_use_fadd_fmad_f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_add_u32 s4, s0, 4
@@ -209,9 +200,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s6, s4, 4
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
@@ -275,9 +263,6 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e64 v0, s2, -4.0
 ; VI-NEXT:    v_mul_f32_e32 v2, s2, v0
@@ -328,13 +313,10 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0xc0c00000
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; VI-NEXT:    v_mul_f32_e32 v2, s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
@@ -380,9 +362,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; VI-DENORM:       ; %bb.0:
 ; VI-DENORM-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
-; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-DENORM-NEXT:    v_add_f16_e64 v0, s2, -1.0
@@ -401,9 +380,6 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
 ; VI-FLUSH:       ; %bb.0:
 ; VI-FLUSH-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
-; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-FLUSH-NEXT:    v_add_f16_e64 v0, s2, -1.0
@@ -522,9 +498,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
 ; VI-DENORM:       ; %bb.0:
 ; VI-DENORM-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
-; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s3
@@ -546,9 +519,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
 ; VI-FLUSH:       ; %bb.0:
 ; VI-FLUSH-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
-; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
@@ -649,9 +619,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
 ; VI-DENORM:       ; %bb.0:
 ; VI-DENORM-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
-; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s3
@@ -673,9 +640,6 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
 ; VI-FLUSH:       ; %bb.0:
 ; VI-FLUSH-NEXT:    s_load_dword s4, s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
-; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s3, s4, 16
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s3
@@ -778,8 +742,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; VI-DENORM-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; VI-DENORM-NEXT:    s_load_dword s6, s[6:7], 0x8
-; VI-DENORM-NEXT:    s_add_i32 s10, s10, s15
-; VI-DENORM-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-DENORM-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-DENORM-NEXT:    s_lshr_b32 s0, s0, 16
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s0
@@ -787,7 +749,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-DENORM-NEXT:    v_fma_f16 v3, |s6|, 2.0, v0
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v0, s2
-; VI-DENORM-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-DENORM-NEXT:    s_add_u32 s4, s2, 2
 ; VI-DENORM-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-DENORM-NEXT:    s_addc_u32 s5, s3, 0
@@ -804,8 +765,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; VI-FLUSH-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
 ; VI-FLUSH-NEXT:    s_load_dword s6, s[6:7], 0x8
-; VI-FLUSH-NEXT:    s_add_i32 s10, s10, s15
-; VI-FLUSH-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-FLUSH-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-FLUSH-NEXT:    s_lshr_b32 s0, s0, 16
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s0
@@ -813,7 +772,6 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-FLUSH-NEXT:    v_mad_f16 v3, |s6|, 2.0, v0
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v0, s2
-; VI-FLUSH-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-FLUSH-NEXT:    s_add_u32 s4, s2, 2
 ; VI-FLUSH-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-FLUSH-NEXT:    s_addc_u32 s5, s3, 0
@@ -917,9 +875,6 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e64 v0, s2, -4.0
 ; VI-NEXT:    v_mul_f16_e32 v2, s2, v0
@@ -973,13 +928,10 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, 0xc600
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mul_f16_e32 v0, s2, v0
 ; VI-NEXT:    v_mul_f16_e32 v2, s2, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_short v[0:1], v2
 ; VI-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index c4f13749251a4c..8267bb9f5450f8 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -8,9 +8,6 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
 ; CI-LABEL: fneg_fabs_fadd_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, |s0|
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -28,9 +25,6 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -77,9 +71,6 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
 ; CI-LABEL: fneg_fabs_fmul_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s1, s0, 0x7fff
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -98,9 +89,6 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
@@ -151,9 +139,6 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_bitset1_b32 s2, 15
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -166,9 +151,6 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitset1_b32 s2, 15
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -213,9 +195,6 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_bitset1_b32 s2, 15
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -228,9 +207,6 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitset1_b32 s2, 15
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -273,9 +249,6 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
 ; CIVI-LABEL: v_fneg_fabs_f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -321,9 +294,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s0, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s1
@@ -346,9 +316,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x4000
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
@@ -357,7 +325,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_or_b32_e32 v2, 0x80008000, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -402,9 +369,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_or_b32 s2, s2, 0x80008000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -417,9 +381,6 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_or_b32 s2, s2, 0x80008000
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -462,9 +423,6 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
 ; CIVI-LABEL: fneg_fabs_v4f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_or_b32 s3, s3, 0x80008000
 ; CIVI-NEXT:    s_or_b32 s2, s2, 0x80008000
@@ -510,9 +468,6 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; CI-LABEL: fold_user_fneg_fabs_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s0, 16
 ; CI-NEXT:    v_cvt_f32_f16_e64 v1, |s1|
@@ -534,9 +489,7 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v0, 0xc400
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
@@ -544,7 +497,6 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
 ; VI-NEXT:    v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v2, v1, v0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -586,9 +538,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    s_and_b32 s0, s4, 0x7fff7fff
@@ -606,9 +555,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_and_b32 s0, s4, 0x7fff7fff
@@ -665,9 +611,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    s_bfe_u32 s0, s4, 0xf0010
@@ -692,9 +635,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
 ; VI-NEXT:    v_mov_b32_e32 v5, 0xc400
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_lshr_b32 s1, s4, 16
@@ -703,7 +644,6 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
 ; VI-NEXT:    s_and_b32 s0, s4, 0x7fff7fff
 ; VI-NEXT:    v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NEXT:    v_mul_f16_e64 v5, |s4|, -4.0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; VI-NEXT:    v_mov_b32_e32 v5, s0
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index e9362ec9b5322f..63ccaafeda88f4 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1478,8 +1478,6 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x4
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x6
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_bitcmp1_b32 s8, 0
 ; GFX7-NEXT:    s_cselect_b64 vcc, -1, 0
@@ -1491,7 +1489,6 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
 ; GFX7-NEXT:    s_cselect_b32 s0, s0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-NEXT:    v_cndmask_b32_e64 v1, v1, -v0, vcc
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v3, s5
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 6e22c9c319f698..40982347f3ca00 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -10,9 +10,6 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -25,9 +22,6 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -72,9 +66,6 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -89,9 +80,6 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -141,9 +129,6 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -156,9 +141,6 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x8000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -201,9 +183,6 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
 ; CI-LABEL: v_fneg_fold_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -221,9 +200,6 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX8-LABEL: v_fneg_fold_f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -270,9 +246,6 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -285,9 +258,6 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -329,17 +299,14 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 {
 ; CIVI-LABEL: s_fneg_v2f16_nonload:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
 ; CIVI-NEXT:    ;;#ASMSTART
 ; CIVI-NEXT:    ; def s2
 ; CIVI-NEXT:    ;;#ASMEND
 ; CIVI-NEXT:    s_xor_b32 s2, s2, 0x80008000
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CIVI-NEXT:    v_mov_b32_e32 v2, s2
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
-; CIVI-NEXT:    v_mov_b32_e32 v2, s2
 ; CIVI-NEXT:    flat_store_dword v[0:1], v2
 ; CIVI-NEXT:    s_endpgm
 ;
@@ -382,9 +349,6 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
@@ -399,9 +363,6 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -451,9 +412,6 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -466,9 +424,6 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -511,9 +466,6 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; CI-LABEL: v_fneg_fold_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -540,9 +492,6 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
 ; GFX8-LABEL: v_fneg_fold_v2f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
@@ -590,9 +539,6 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; CI-LABEL: v_extract_fneg_fold_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -614,9 +560,6 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
 ; GFX8-LABEL: v_extract_fneg_fold_v2f16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
@@ -681,9 +624,6 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0
 ; CIVI-LABEL: v_extract_fneg_no_fold_v2f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index 6207c442f41eeb..3735c6349fbb31 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -10,9 +10,6 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -24,9 +21,6 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -54,9 +48,6 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -68,9 +59,6 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -97,9 +85,6 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
 ; CIVI-LABEL: load_v3f16_arg:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_add_u32 s4, s0, 4
 ; CIVI-NEXT:    s_addc_u32 s5, s1, 0
@@ -135,9 +120,6 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
 ; CIVI-LABEL: load_v4f16_arg:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v2, s2
@@ -165,9 +147,6 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v4, s4
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -182,9 +161,6 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v4, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -217,9 +193,6 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s3, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -233,9 +206,6 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -269,9 +239,6 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s2
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -283,9 +250,6 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v2, s2
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -315,9 +279,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s3, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -331,9 +292,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
@@ -366,9 +324,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ; CI-LABEL: extload_v3f16_to_v3f32_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, s3
@@ -382,9 +337,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
 ; VI-LABEL: extload_v3f16_to_v3f32_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s2, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s2
@@ -417,9 +369,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; CI-LABEL: extload_v4f16_to_v4f32_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s3, 16
 ; CI-NEXT:    s_lshr_b32 s5, s2, 16
@@ -435,9 +384,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
 ; VI-LABEL: extload_v4f16_to_v4f32_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s3, 16
 ; VI-NEXT:    s_lshr_b32 s5, s2, 16
@@ -475,9 +421,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s6, s1, 16
 ; CI-NEXT:    s_lshr_b32 s7, s0, 16
@@ -506,9 +449,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s6, s1, 16
 ; VI-NEXT:    s_lshr_b32 s7, s0, 16
@@ -567,9 +507,6 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
 ; CI-LABEL: extload_f16_to_f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -583,9 +520,6 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
 ; VI-LABEL: extload_f16_to_f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s0, s[6:7], 0x8
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -619,9 +553,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
 ; CI-LABEL: extload_v2f16_to_v2f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s1, s0, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s1
@@ -638,9 +569,6 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
 ; VI-LABEL: extload_v2f16_to_v2f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s0, s[6:7], 0x8
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s1, s0, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s1
@@ -680,9 +608,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ; CI-LABEL: extload_v3f16_to_v3f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
 ; CI-NEXT:    s_lshr_b32 s4, s2, 16
@@ -704,9 +629,6 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
 ; VI-LABEL: extload_v3f16_to_v3f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, s3
 ; VI-NEXT:    s_lshr_b32 s4, s2, 16
@@ -754,9 +676,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ; CI-LABEL: extload_v4f16_to_v4f64_arg:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s3, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s3
@@ -782,9 +701,6 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
 ; VI-LABEL: extload_v4f16_to_v4f64_arg:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s5, s3, 16
 ; VI-NEXT:    v_cvt_f32_f16_e32 v0, s3
@@ -840,9 +756,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s6, s3, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s6
@@ -890,9 +803,6 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s6, s0, 16
 ; VI-NEXT:    s_lshr_b32 s8, s2, 16
@@ -980,9 +890,6 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr
 ; CIVI-LABEL: global_load_store_f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1013,9 +920,6 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: global_load_store_v2f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1046,9 +950,6 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add
 ; CIVI-LABEL: global_load_store_v4f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
@@ -1079,9 +980,6 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: global_load_store_v8f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1112,9 +1010,6 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr
 ; CIVI-LABEL: global_extload_f16_to_f32:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1148,9 +1043,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v2f16_to_v2f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1167,9 +1059,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v2f16_to_v2f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1207,9 +1096,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v3f16_to_v3f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1227,9 +1113,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v3f16_to_v3f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1269,9 +1152,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v4f16_to_v4f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1291,9 +1171,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v4f16_to_v4f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1336,9 +1213,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v8f16_to_v8f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1369,9 +1243,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v8f16_to_v8f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1430,9 +1301,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; CI-LABEL: global_extload_v16f16_to_v16f32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s4, s2, 16
 ; CI-NEXT:    s_addc_u32 s5, s3, 0
@@ -1491,9 +1359,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
 ; VI-LABEL: global_extload_v16f16_to_v16f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1593,9 +1458,6 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr
 ; CIVI-LABEL: global_extload_f16_to_f64:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1632,9 +1494,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v2f16_to_v2f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1653,9 +1512,6 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v2f16_to_v2f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1698,9 +1554,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v3f16_to_v3f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1726,9 +1579,6 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v3f16_to_v3f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1782,9 +1632,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v4f16_to_v4f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1813,9 +1660,6 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v4f16_to_v4f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1875,9 +1719,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
 ; CI-LABEL: global_extload_v8f16_to_v8f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -1926,9 +1767,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
 ; VI-LABEL: global_extload_v8f16_to_v8f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2015,9 +1853,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; CI-LABEL: global_extload_v16f16_to_v16f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2113,9 +1948,6 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
 ; VI-LABEL: global_extload_v16f16_to_v16f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2271,9 +2103,6 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p
 ; CIVI-LABEL: global_truncstore_f32_to_f16:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2307,9 +2136,6 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v2f32_to_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2327,9 +2153,6 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v2f32_to_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2368,9 +2191,6 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v3f32_to_v3f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2394,9 +2214,6 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v3f32_to_v3f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2444,9 +2261,6 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v4f32_to_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2468,9 +2282,6 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v4f32_to_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2515,9 +2326,6 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
 ; CI-LABEL: global_truncstore_v8f32_to_v8f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2553,9 +2361,6 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
 ; VI-LABEL: global_truncstore_v8f32_to_v8f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2621,9 +2426,6 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; CI-LABEL: global_truncstore_v16f32_to_v16f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_add_u32 s4, s2, 32
 ; CI-NEXT:    s_addc_u32 s5, s3, 0
@@ -2693,9 +2495,6 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
 ; VI-LABEL: global_truncstore_v16f32_to_v16f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_add_u32 s4, s2, 32
 ; VI-NEXT:    s_addc_u32 s5, s3, 0
@@ -2808,9 +2607,6 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
 ; CI-LABEL: fadd_f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s0, s[6:7], 0x2
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s0
 ; CI-NEXT:    s_lshr_b32 s0, s0, 16
@@ -2828,9 +2624,6 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s3, s2, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
@@ -2863,9 +2656,6 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
 ; CI-LABEL: fadd_v2f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_lshr_b32 s4, s2, 16
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
@@ -2887,9 +2677,6 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
 ; VI-LABEL: fadd_v2f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_lshr_b32 s4, s3, 16
 ; VI-NEXT:    s_lshr_b32 s5, s2, 16
@@ -2923,9 +2710,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-LABEL: fadd_v4f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s2
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
@@ -2963,9 +2747,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-LABEL: fadd_v4f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
@@ -3006,9 +2787,6 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x half> %b) #0 {
 ; CI-LABEL: fadd_v8f16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3067,9 +2845,6 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
 ;
 ; VI-LABEL: fadd_v8f16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3134,9 +2909,6 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr
 ; CIVI-LABEL: test_bitcast_from_half:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s1
@@ -3168,9 +2940,6 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
 ; CIVI-LABEL: test_bitcast_to_half:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s2
 ; CIVI-NEXT:    v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 51b0e2b86cdf39..8c017fa5ec2636 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,10 +15,7 @@
 ; CHECK:     .max_flat_workgroup_size: 1024
 ; CHECK:     .name:           test
 ; CHECK:     .private_segment_fixed_size: 0
-; GFX700:     .sgpr_count:     22
-; GFX803:     .sgpr_count:     24
-; GFX900:     .sgpr_count:     10
-; GFX1010:    .sgpr_count:     10
+; CHECK:     .sgpr_count:     10
 ; CHECK:     .symbol:         test.kd
 ; CHECK:     .vgpr_count:     {{3|6}}
 ; WAVE64:    .wavefront_size: 64
@@ -51,8 +48,8 @@ entry:
 
 ; CHECK:   .name:       num_spilled_sgprs
 ; GFX700:   .sgpr_spill_count: 10
-; GFX803:   .sgpr_spill_count: 0
-; GFX900:   .sgpr_spill_count: 0
+; GFX803:   .sgpr_spill_count: 10
+; GFX900:   .sgpr_spill_count: 62
 ; GFX1010:  .sgpr_spill_count: 60
 ; CHECK:   .symbol:     num_spilled_sgprs.kd
 define amdgpu_kernel void @num_spilled_sgprs(
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 2c38e201d326f6..37476203fbfad6 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -43,7 +43,7 @@
 ; ELF:   00E0: 6E616D65 A673696D 706C65BB 2E707269
 ; ELF:   00F0: 76617465 5F736567 6D656E74 5F666978
 ; ELF:   0100: 65645F73 697A6500 AB2E7367 70725F63
-; ELF:   0110: 6F756E74 0EB12E73 6770725F 7370696C
+; ELF:   0110: 6F756E74 06B12E73 6770725F 7370696C
 ; ELF:   0120: 6C5F636F 756E7400 A72E7379 6D626F6C
 ; ELF:   0130: A973696D 706C652E 6B64AB2E 76677072
 ; ELF:   0140: 5F636F75 6E7403B1 2E766770 725F7370
@@ -59,7 +59,7 @@
 ; ELF:   01E0: 73696D70 6C655F6E 6F5F6B65 726E6172
 ; ELF:   01F0: 6773BB2E 70726976 6174655F 7365676D
 ; ELF:   0200: 656E745F 66697865 645F7369 7A6500AB
-; ELF:   0210: 2E736770 725F636F 756E740C B12E7367
+; ELF:   0210: 2E736770 725F636F 756E7400 B12E7367
 ; ELF:   0220: 70725F73 70696C6C 5F636F75 6E7400A7
 ; ELF:   0230: 2E73796D 626F6CB5 73696D70 6C655F6E
 ; ELF:   0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 8f9b223c361d60..1b12f668e207cd 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -12,9 +12,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
 ; GFX8V4-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V4-NEXT:    s_cselect_b32 s3, s3, 0
@@ -24,7 +22,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX8V4-NEXT:    s_cselect_b32 s0, s2, 0
 ; GFX8V4-NEXT:    s_cselect_b32 s1, s1, 0
-; GFX8V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V4-NEXT:    flat_store_dword v[0:1], v4
@@ -38,9 +35,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8V5-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0xc8
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX8V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX8V5-NEXT:    s_cselect_b32 s2, s2, 0
@@ -50,7 +45,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX8V5-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8V5-NEXT:    s_cselect_b32 s0, s3, 0
 ; GFX8V5-NEXT:    s_cselect_b32 s1, s1, 0
-; GFX8V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8V5-NEXT:    flat_store_dword v[0:1], v4
@@ -63,10 +57,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-LABEL: addrspacecast:
 ; GFX9V4:       ; %bb.0:
 ; GFX9V4-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9V4-NEXT:    s_add_u32 flat_scratch_lo, s12, s17
-; GFX9V4-NEXT:    s_addc_u32 flat_scratch_hi, s13, 0
 ; GFX9V4-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V4-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V4-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX9V4-NEXT:    s_cselect_b32 s2, s3, 0
@@ -76,7 +69,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V4-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9V4-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX9V4-NEXT:    s_cselect_b32 s1, s1, 0
-; GFX9V4-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9V4-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V4-NEXT:    flat_store_dword v[0:1], v4
@@ -89,10 +81,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-LABEL: addrspacecast:
 ; GFX9V5:       ; %bb.0:
 ; GFX9V5-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9V5-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX9V5-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX9V5-NEXT:    s_mov_b64 s[2:3], src_private_base
 ; GFX9V5-NEXT:    s_mov_b64 s[4:5], src_shared_base
+; GFX9V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9V5-NEXT:    s_cmp_lg_u32 s0, -1
 ; GFX9V5-NEXT:    s_cselect_b32 s2, s3, 0
@@ -102,7 +93,6 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
 ; GFX9V5-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9V5-NEXT:    s_cselect_b32 s0, s5, 0
 ; GFX9V5-NEXT:    s_cselect_b32 s1, s1, 0
-; GFX9V5-NEXT:    v_mov_b32_e32 v4, 1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9V5-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9V5-NEXT:    flat_store_dword v[0:1], v4
@@ -124,9 +114,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x40
 ; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -139,9 +126,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xcc
 ; GFX8V5-NEXT:    s_load_dword s1, s[6:7], 0x4
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -184,9 +168,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V4:       ; %bb.0:
 ; GFX8V4-NEXT:    s_load_dword s0, s[6:7], 0x44
 ; GFX8V4-NEXT:    s_load_dword s1, s[8:9], 0x4
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
 ; GFX8V4-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V4-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V4-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -199,9 +180,6 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
 ; GFX8V5:       ; %bb.0:
 ; GFX8V5-NEXT:    s_load_dword s0, s[6:7], 0xc8
 ; GFX8V5-NEXT:    s_load_dword s1, s[6:7], 0x4
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX8V5-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8V5-NEXT:    s_cmp_eq_u32 s1, s0
 ; GFX8V5-NEXT:    s_cselect_b64 s[0:1], -1, 0
@@ -285,10 +263,7 @@ define amdgpu_kernel void @llvm_debugtrap() {
 define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V4:       ; %bb.0:
-; GFX8V4-NEXT:    s_add_i32 s12, s12, s17
 ; GFX8V4-NEXT:    v_mov_b32_e32 v0, s6
-; GFX8V4-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; GFX8V4-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; GFX8V4-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8V4-NEXT:    s_add_u32 s0, s8, 8
 ; GFX8V4-NEXT:    flat_load_ubyte v0, v[0:1] glc
@@ -313,9 +288,6 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr)  {
 ;
 ; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
 ; GFX8V5:       ; %bb.0:
-; GFX8V5-NEXT:    s_add_i32 s10, s10, s15
-; GFX8V5-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8V5-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8V5-NEXT:    s_add_u32 s0, s6, 8
 ; GFX8V5-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GFX8V5-NEXT:    s_addc_u32 s1, s7, 0
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index ff5c71b65f34bd..4d62d30a38ed34 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,15 +8,15 @@
 define amdgpu_kernel void @s_input_output_i128() {
   ; GFX908-LABEL: name: s_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %12
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %12
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %11
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %11
   ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: s_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %10
-  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %10
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7340042 /* regdef:SGPR_128 */, def %9
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:sgpr_128 = COPY %9
   ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7340041 /* reguse:SGPR_128 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=s"()
@@ -27,15 +27,15 @@ define amdgpu_kernel void @s_input_output_i128() {
 define amdgpu_kernel void @v_input_output_i128() {
   ; GFX908-LABEL: name: v_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %12
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vreg_128 = COPY %12
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %11
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:vreg_128 = COPY %11
   ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:VReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: v_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %10
-  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %10
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %9
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %9
   ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6553609 /* reguse:VReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = tail call i128 asm sideeffect "; def $0", "=v"()
@@ -46,15 +46,15 @@ define amdgpu_kernel void @v_input_output_i128() {
 define amdgpu_kernel void @a_input_output_i128() {
   ; GFX908-LABEL: name: a_input_output_i128
   ; GFX908: bb.0 (%ir-block.0):
-  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %12
-  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY %12
+  ; GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6160394 /* regdef:AReg_128 */, def %11
+  ; GFX908-NEXT:   [[COPY:%[0-9]+]]:areg_128 = COPY %11
   ; GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6160393 /* reguse:AReg_128 */, [[COPY]]
   ; GFX908-NEXT:   S_ENDPGM 0
   ;
   ; GFX90A-LABEL: name: a_input_output_i128
   ; GFX90A: bb.0 (%ir-block.0):
-  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %10
-  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %10
+  ; GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6422538 /* regdef:AReg_128_Align2 */, def %9
+  ; GFX90A-NEXT:   [[COPY:%[0-9]+]]:areg_128_align2 = COPY %9
   ; GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6422537 /* reguse:AReg_128_Align2 */, [[COPY]]
   ; GFX90A-NEXT:   S_ENDPGM 0
   %val = call i128 asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index 8a5c81fa8aa432..3135addec16183 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -22,9 +22,6 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
 ; VI-LABEL: s_insertelement_v2bf16_0:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -85,9 +82,6 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
 ; VI-LABEL: s_insertelement_v2bf16_1:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -150,9 +144,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -225,9 +216,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -298,9 +286,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -373,9 +358,6 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -453,14 +435,11 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1)
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -551,17 +530,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, s4, v0, v4
@@ -633,17 +609,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, v0, s4, v4
@@ -713,17 +686,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
@@ -795,17 +765,14 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, v1, s4, v4
@@ -881,12 +848,9 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -978,12 +942,9 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
@@ -1105,12 +1066,9 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -1287,14 +1245,11 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -1462,14 +1417,11 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index aafa4a04a00de9..647870f0e08979 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -20,9 +20,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2i16_0:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -72,9 +69,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -91,9 +85,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -149,9 +140,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -172,9 +160,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -237,9 +222,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -255,9 +237,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -314,9 +293,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -336,9 +312,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -409,9 +382,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -434,9 +404,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -508,9 +475,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2i16_1:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -559,9 +523,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -578,9 +539,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -631,9 +589,6 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2f16_0:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -681,9 +636,6 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; CIVI-LABEL: s_insertelement_v2f16_1:
 ; CIVI:       ; %bb.0:
 ; CIVI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CIVI-NEXT:    s_add_i32 s10, s10, s15
-; CIVI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CIVI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CIVI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CIVI-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; CIVI-NEXT:    v_mov_b32_e32 v0, s0
@@ -732,9 +684,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -753,9 +702,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -814,12 +760,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
@@ -836,12 +779,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v3, v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
@@ -898,9 +838,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -919,9 +856,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -979,9 +913,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1000,9 +931,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1059,9 +987,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1080,9 +1005,6 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1139,9 +1061,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1160,9 +1079,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1219,9 +1135,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1240,9 +1153,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1299,9 +1209,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1320,9 +1227,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1379,9 +1283,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
@@ -1400,9 +1301,6 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
@@ -1465,9 +1363,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1487,9 +1382,6 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1554,12 +1446,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -1578,12 +1467,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dword v3, v[0:1]
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s0, v2
@@ -1649,14 +1535,11 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
+; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, s3
 ; VI-NEXT:    flat_load_dword v4, v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1679,14 +1562,11 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
+; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s4, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    v_mov_b32_e32 v3, s3
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1757,17 +1637,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, s4, v0, v4
@@ -1779,12 +1656,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1844,17 +1718,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v0, v0, s4, v4
@@ -1866,12 +1737,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -1932,17 +1800,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x30
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
@@ -1954,12 +1819,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0xc
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2019,17 +1881,14 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x1000504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, v1, s4, v4
@@ -2041,12 +1900,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2107,17 +1963,14 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s0, v2
-; VI-NEXT:    v_mov_b32_e32 v4, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_perm_b32 v1, s4, v1, v4
@@ -2129,12 +1982,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2200,9 +2050,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    flat_load_dword v4, v[0:1] glc
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
@@ -2230,9 +2077,6 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    flat_load_dword v4, v[0:1] glc
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
@@ -2316,12 +2160,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2345,12 +2186,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -2420,12 +2258,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s0, v4
@@ -2443,12 +2278,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2509,12 +2341,9 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2532,12 +2361,9 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2629,12 +2455,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2686,12 +2509,9 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    v_mov_b32_e32 v5, s1
@@ -2836,14 +2656,11 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -2866,12 +2683,9 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s3
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, s2, v8
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v4
@@ -2947,14 +2761,12 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v12, 0x3020504
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -2962,7 +2774,6 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; VI-NEXT:    v_add_u32_e32 v8, vcc, s0, v8
 ; VI-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
 ; VI-NEXT:    v_add_u32_e32 v10, vcc, 16, v8
-; VI-NEXT:    v_mov_b32_e32 v12, 0x3020504
 ; VI-NEXT:    v_addc_u32_e32 v11, vcc, 0, v9, vcc
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_perm_b32 v3, s4, v3, v12
@@ -2976,14 +2787,11 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s4, s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v4, vcc, 16, v0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; CI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -3120,14 +2928,11 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s3
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v8
 ; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v0, vcc
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, 16, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
 ; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -3220,14 +3025,11 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, 16, v0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; CI-NEXT:    flat_load_dwordx4 v[7:10], v[2:3]
 ; CI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index 1a32953305bbc3..496a1c652da251 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -7,7 +7,7 @@ declare void @llvm.trap() #0
 ; DOORBELL-NEXT:     .amdhsa_group_segment_fixed_size 0
 ; DOORBELL-NEXT:     .amdhsa_private_segment_fixed_size 0
 ; DOORBELL-NEXT:     .amdhsa_kernarg_size 8
-; DOORBELL-NEXT:     .amdhsa_user_sgpr_count 14
+; DOORBELL-NEXT:     .amdhsa_user_sgpr_count 12
 ; DOORBELL-NEXT:     .amdhsa_user_sgpr_private_segment_buffer 1
 ; DOORBELL:      .end_amdhsa_kernel
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index c998a007277933..c201f84cac7268 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -23,11 +23,8 @@ define void @function_lds_id(ptr addrspace(1) %out) {
 define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
 ; GCN-LABEL: kernel_lds_id:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s2, s12, 42
+; GCN-NEXT:    s_add_i32 s2, s10, 42
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
@@ -75,9 +72,6 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
 define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
 ; GCN-LABEL: doesnt_use_it:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x64
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index d464269497b471..b061d53de5d3c5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -152,9 +152,6 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
 ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -165,9 +162,6 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
 ; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -183,13 +177,10 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
 ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -198,13 +189,10 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 32
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -217,13 +205,10 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
 ; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -232,14 +217,11 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -252,15 +234,12 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
 ; CHECK-SDAG-LABEL: test_readfirstlane_m0:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 m0, -1
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -268,15 +247,12 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) #1 {
 ; CHECK-GISEL-LABEL: test_readfirstlane_m0:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 m0, -1
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -290,31 +266,25 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1
 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 s2, 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -328,16 +298,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -345,16 +312,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -368,16 +332,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
 ; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -385,16 +346,13 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
 ; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index ffceac2c912bb6..24a332fa211c15 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -179,9 +179,6 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -192,9 +189,6 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
 ; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, 32
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -210,13 +204,10 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -225,13 +216,10 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 32
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -244,13 +232,10 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
 ; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40400000
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -259,14 +244,11 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    s_mov_b32 s3, 0x40400000
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -280,9 +262,6 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -302,9 +281,6 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -335,9 +311,6 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -359,9 +332,6 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -395,9 +365,6 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -419,9 +386,6 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CHECK-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -455,15 +419,12 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
 ; CHECK-SDAG-LABEL: test_readlane_m0_sreg:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 m0, -1
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -471,15 +432,12 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
 ; CHECK-GISEL-LABEL: test_readlane_m0_sreg:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 m0, -1
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, m0
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -496,14 +454,11 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; def v0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v0, 32
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
@@ -513,13 +468,10 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    ; def v0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -533,17 +485,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; def v[0:1]
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v1, 32
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s3, v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -556,13 +505,10 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
 ; CHECK-GISEL-NEXT:    ; def v[0:1]
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s3, v1, 32
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -577,17 +523,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
 ; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    ; def v[0:1]
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s2, v1, 32
 ; CHECK-SDAG-NEXT:    v_readlane_b32 s3, v0, 32
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s3
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -600,13 +543,10 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
 ; CHECK-GISEL-NEXT:    ; def v[0:1]
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    v_readlane_b32 s3, v1, 32
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -621,31 +561,25 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou
 ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b32 s2, 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-SDAG-NEXT:    s_endpgm
 ;
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b32 s2, 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s2
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; CHECK-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -659,16 +593,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
 ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -676,16 +607,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
@@ -699,16 +627,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
 ; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64:
 ; CHECK-SDAG:       ; %bb.0:
 ; CHECK-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; CHECK-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CHECK-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CHECK-SDAG-NEXT:    ;;#ASMSTART
 ; CHECK-SDAG-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-SDAG-NEXT:    ;;#ASMEND
-; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-SDAG-NEXT:    s_endpgm
@@ -716,16 +641,13 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
 ; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
 ; CHECK-GISEL:       ; %bb.0:
 ; CHECK-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CHECK-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; CHECK-GISEL-NEXT:    ;;#ASMSTART
 ; CHECK-GISEL-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-GISEL-NEXT:    ;;#ASMEND
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CHECK-GISEL-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; CHECK-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; CHECK-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index da40a06c306b98..9d93ca65683c42 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -15,9 +15,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX802-SDAG-LABEL: test_writelane_sreg_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -58,9 +55,6 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
@@ -108,9 +102,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s6, s[6:7], 0x10
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
@@ -162,9 +153,6 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s6, s[6:7], 0x10
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
@@ -222,9 +210,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s6, s[6:7], 0x10
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s6
@@ -276,9 +261,6 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s6, s[6:7], 0x10
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s6
@@ -336,9 +318,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s2, s[6:7], 0x8
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -383,9 +362,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s2, s[6:7], 0x8
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -436,9 +412,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x8
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
@@ -489,9 +462,6 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x8
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -548,14 +518,11 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x8
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
@@ -606,14 +573,11 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x8
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    s_mov_b32 s5, 0x40400000
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
@@ -669,9 +633,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -733,9 +694,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -808,9 +766,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -878,9 +833,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -957,9 +909,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX802-SDAG-NEXT:    s_mov_b32 s4, 0x40280000
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -968,7 +918,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-SDAG-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX802-SDAG-NEXT:    flat_load_dword v2, v[0:1]
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX802-SDAG-NEXT:    s_mov_b32 s4, 0x40280000
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
@@ -1031,9 +980,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 4, v0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX802-GISEL-NEXT:    s_mov_b32 s4, 0x40280000
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -1043,7 +990,6 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX802-GISEL-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
-; GFX802-GISEL-NEXT:    s_mov_b32 s4, 0x40280000
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1118,18 +1064,15 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX802-SDAG-NEXT:    ;;#ASMSTART
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, -1
 ; GFX802-SDAG-NEXT:    ;;#ASMEND
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    s_mov_b32 s4, m0
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s2
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v2, s4, m0
-; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
@@ -1176,18 +1119,15 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX802-GISEL-NEXT:    ;;#ASMSTART
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, -1
 ; GFX802-GISEL-NEXT:    ;;#ASMEND
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    s_mov_b32 s4, m0
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s2
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v2, s4, m0
-; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
@@ -1238,9 +1178,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s2, s[6:7], 0x8
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -1285,9 +1222,6 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s2, s[6:7], 0x8
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1337,9 +1271,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX802-SDAG-LABEL: test_writelane_imm_i64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
@@ -1385,9 +1316,6 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX802-GISEL-LABEL: test_writelane_imm_i64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1439,9 +1367,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX802-SDAG-LABEL: test_writelane_imm_f64:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
@@ -1487,9 +1412,6 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX802-GISEL-LABEL: test_writelane_imm_f64:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -1542,9 +1464,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
@@ -1584,9 +1503,6 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
@@ -1632,13 +1548,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
@@ -1683,14 +1596,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s8
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1739,13 +1649,10 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v3, s5, m0
@@ -1790,14 +1697,11 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-GISEL-NEXT:    s_load_dword s8, s[6:7], 0x18
 ; GFX802-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s8
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s2
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s5, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s3
@@ -1844,10 +1748,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32:
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, 42
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-SDAG-NEXT:    s_mov_b32 m0, s3
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, s0
@@ -1881,10 +1782,7 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, 42
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s3
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1924,14 +1822,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, 42
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
@@ -1972,14 +1867,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, 42
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2025,14 +1917,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX802-SDAG:       ; %bb.0:
 ; GFX802-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX802-SDAG-NEXT:    s_load_dword s4, s[6:7], 0x10
-; GFX802-SDAG-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v0, 0
-; GFX802-SDAG-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX802-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
-; GFX802-SDAG-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT:    s_mov_b32 m0, s4
 ; GFX802-SDAG-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
@@ -2073,14 +1962,11 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX802-GISEL:       ; %bb.0:
 ; GFX802-GISEL-NEXT:    s_load_dword s4, s[6:7], 0x10
 ; GFX802-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX802-GISEL-NEXT:    s_add_i32 s10, s10, s15
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40450000
 ; GFX802-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX802-GISEL-NEXT:    s_mov_b32 m0, s4
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v3, s1
-; GFX802-GISEL-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX802-GISEL-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
 ; GFX802-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
 ; GFX802-GISEL-NEXT:    v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 59cc6dfac12005..7202ab8b314669 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -22,9 +22,6 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
 ; GFX7-HSA-LABEL: constant_load_f64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -96,9 +93,6 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
 ; GFX7-HSA-LABEL: constant_load_2v4f64:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[20:21], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[16:17], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 8d833c40a44ee3..22b718935738bd 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -26,9 +26,6 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: constant_load_i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -111,9 +108,6 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v2i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -187,9 +181,6 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v3i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 4
@@ -296,9 +287,6 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v4i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -375,9 +363,6 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
 ; GCN-HSA-LABEL: constant_load_v8i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -464,9 +449,6 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GCN-HSA-LABEL: constant_load_v16i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -609,9 +591,6 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
 ; GCN-HSA-LABEL: constant_load_v16i16_align2:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
@@ -825,9 +804,6 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_zextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -905,9 +881,6 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_sextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -986,9 +959,6 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1066,9 +1036,6 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1145,9 +1112,6 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1235,9 +1199,6 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1329,9 +1290,6 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1431,9 +1389,6 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1538,9 +1493,6 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1649,9 +1601,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1771,9 +1720,6 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1934,9 +1880,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2116,9 +2059,6 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2383,9 +2323,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2694,9 +2631,6 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3177,9 +3111,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3750,9 +3681,6 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4676,9 +4604,6 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5467,9 +5392,6 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_zextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5558,9 +5480,6 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
 ; GCN-HSA-LABEL: constant_sextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5648,9 +5567,6 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5734,9 +5650,6 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5823,15 +5736,12 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s0, s2, 16
 ; GCN-HSA-NEXT:    s_and_b32 s1, s2, 0xffff
@@ -5922,9 +5832,6 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -6030,13 +5937,10 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s4, s3, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s5, s2, 16
@@ -6169,9 +6073,6 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6330,13 +6231,10 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s8, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s2, s7, 16
@@ -6549,9 +6447,6 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6815,13 +6710,10 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s12, s5, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s13, s7, 16
@@ -7198,9 +7090,6 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx8 s[8:15], s[2:3], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7679,13 +7568,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_lshr_b32 s19, s1, 16
 ; GCN-HSA-NEXT:    s_lshr_b32 s20, s3, 16
@@ -8395,9 +8281,6 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
 ; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index cf0cecde326e1a..f1a6bccc559f04 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -23,9 +23,6 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
 ; GFX7-HSA-LABEL: constant_load_i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -108,9 +105,6 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v2i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -200,9 +194,6 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v3i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -299,9 +290,6 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v4i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -402,9 +390,6 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX7-HSA-LABEL: constant_load_v8i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -541,9 +526,6 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX7-HSA-LABEL: constant_load_v9i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s12, s[10:11], 0x8
@@ -707,9 +689,6 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX7-HSA-LABEL: constant_load_v10i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[12:13], s[10:11], 0x8
@@ -882,9 +861,6 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-LABEL: constant_load_v11i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[12:15], s[2:3], 0x8
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
@@ -1062,9 +1038,6 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX7-HSA-LABEL: constant_load_v12i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[12:15], s[10:11], 0x8
@@ -1248,9 +1221,6 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-LABEL: constant_load_v16i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s18, s16, 48
@@ -1439,9 +1409,6 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX7-HSA-LABEL: constant_zextload_i32_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1528,9 +1495,6 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
 ; GFX7-HSA-LABEL: constant_sextload_i32_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1623,9 +1587,6 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
@@ -1712,9 +1673,6 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -1809,15 +1767,12 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s1
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s3
@@ -1912,9 +1867,6 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -2031,16 +1983,13 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GFX7-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s3
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s6
@@ -2169,9 +2118,6 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2336,10 +2282,8 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s2, s0, 48
@@ -2347,7 +2291,6 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GFX7-HSA-NEXT:    s_add_u32 s2, s0, 32
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s10
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v2, s11
@@ -2548,9 +2491,6 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -2849,9 +2789,6 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3308,16 +3245,13 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s18, s16, 0x70
 ; GFX7-HSA-NEXT:    s_addc_u32 s19, s17, 0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s18
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s19
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s14
@@ -3747,9 +3681,6 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x10
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4607,17 +4538,14 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
 ; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[36:39], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
 ; GFX7-HSA-NEXT:    s_add_u32 s34, s36, 0xf0
 ; GFX7-HSA-NEXT:    s_addc_u32 s35, s37, 0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s34
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v5, s35
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s30
@@ -5231,9 +5159,6 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
 ; GFX7-HSA-LABEL: constant_load_v32i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[36:39], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index 45f0af8d423b67..46c7c2b08cd64b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -22,9 +22,6 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
 ; GFX7-LABEL: constant_load_i64:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
@@ -100,9 +97,6 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-LABEL: constant_load_v2i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-NEXT:    v_mov_b32_e32 v4, s0
@@ -189,9 +183,6 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-LABEL: constant_load_v3i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[2:3], 0x4
 ; GFX7-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
@@ -308,9 +299,6 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX7-LABEL: constant_load_v4i64:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -442,9 +430,6 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-LABEL: constant_load_v8i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-NEXT:    s_add_u32 s18, s16, 48
@@ -664,9 +649,6 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
 ; GFX7-LABEL: constant_load_v16i64:
 ; GFX7:       ; %bb.0: ; %entry
 ; GFX7-NEXT:    s_load_dwordx4 s[36:39], s[6:7], 0x0
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    s_load_dwordx16 s[16:31], s[38:39], 0x10
 ; GFX7-NEXT:    s_load_dwordx16 s[0:15], s[38:39], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index 22638d621932e1..67a376b8c0f3c5 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -27,9 +27,6 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
 ; GFX7-HSA-LABEL: constant_load_i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -117,9 +114,6 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v2i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -205,9 +199,6 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v3i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -320,9 +311,6 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v4i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -394,9 +382,6 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
 ; GFX7-HSA-LABEL: constant_load_v8i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -473,9 +458,6 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
 ; GFX7-HSA-LABEL: constant_load_v16i8:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -559,9 +541,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_zextload_i8_to_i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -639,9 +618,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_sextload_i8_to_i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -720,9 +696,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -800,9 +773,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -884,9 +854,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -987,9 +954,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1090,9 +1054,6 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1196,9 +1157,6 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32:
 ; GFX7-HSA:       ; %bb.0: ; %entry
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, s0
@@ -1304,9 +1262,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1413,9 +1368,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -1535,9 +1487,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1697,9 +1646,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1886,9 +1832,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2153,9 +2096,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2476,9 +2416,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2955,9 +2892,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3550,9 +3484,6 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -4460,9 +4391,6 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[16:19], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx16 s[0:15], s[18:19], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -5286,9 +5214,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_zextload_i8_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5373,9 +5298,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_sextload_i8_to_i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5463,9 +5385,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5548,9 +5467,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5641,9 +5557,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5752,9 +5665,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5870,13 +5780,10 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_bfe_u32 s4, s2, 0x80008
 ; GFX7-HSA-NEXT:    s_lshr_b32 s3, s2, 24
@@ -6014,9 +5921,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6178,13 +6082,10 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_lshr_b32 s4, s2, 24
 ; GFX7-HSA-NEXT:    s_lshr_b32 s5, s3, 24
@@ -6405,9 +6306,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -6680,13 +6578,10 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_lshr_b32 s8, s5, 24
 ; GFX7-HSA-NEXT:    s_lshr_b32 s9, s4, 24
@@ -7073,9 +6968,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -7563,13 +7455,10 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GFX7-HSA-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_lshr_b32 s12, s4, 24
 ; GFX7-HSA-NEXT:    s_lshr_b32 s13, s5, 24
@@ -8309,9 +8198,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64:
 ; GFX7-HSA:       ; %bb.0:
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -9064,9 +8950,6 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_zextload_i8_to_i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9153,9 +9036,6 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
 ; GFX7-HSA-LABEL: constant_sextload_i8_to_i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9244,9 +9124,6 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9333,9 +9210,6 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9427,9 +9301,6 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9531,9 +9402,6 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -9645,9 +9513,6 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -9786,9 +9651,6 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v0, s0
@@ -9945,9 +9807,6 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -10153,9 +10012,6 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
 ; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    v_mov_b32_e32 v4, s0
@@ -10399,9 +10255,6 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -10765,9 +10618,6 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
 ; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16:
 ; GFX7-HSA:       ; %bb.0:
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11204,9 +11054,6 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ;
 ; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16:
 ; GFX7-HSA:       ; %bb.0:
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -11882,9 +11729,6 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
 ;
 ; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16:
 ; GFX7-HSA:       ; %bb.0:
-; GFX7-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-HSA-NEXT:    s_load_dwordx4 s[8:11], s[6:7], 0x0
 ; GFX7-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-HSA-NEXT:    s_load_dwordx8 s[0:7], s[10:11], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 065e92d545cc96..4cc47b09d813d6 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -28,9 +28,6 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(
 ; GCN-HSA-LABEL: global_load_i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -136,9 +133,6 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v2i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -225,9 +219,6 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v3i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -359,9 +350,6 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v4i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -447,9 +435,6 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac
 ; GCN-HSA-LABEL: global_load_v8i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -538,9 +523,6 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
 ; GCN-HSA-LABEL: global_load_v16i16:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
@@ -691,9 +673,6 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
 ; GCN-HSA-LABEL: global_load_v16i16_align2:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s1
@@ -843,9 +822,6 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_zextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -931,9 +907,6 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_sextload_i16_to_i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1022,9 +995,6 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1110,9 +1080,6 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1203,9 +1170,6 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1305,9 +1269,6 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1409,9 +1370,6 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1522,9 +1480,6 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32:
 ; GCN-HSA:       ; %bb.0: ; %entry
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1642,9 +1597,6 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1760,9 +1712,6 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1885,9 +1834,6 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2037,9 +1983,6 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2204,9 +2147,6 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -2443,9 +2383,6 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2717,9 +2654,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -3131,9 +3065,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -3653,9 +3584,6 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -4459,9 +4387,6 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5226,9 +5151,6 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_zextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5326,9 +5248,6 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr
 ; GCN-HSA-LABEL: global_sextload_i16_to_i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5424,9 +5343,6 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5519,9 +5435,6 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5620,9 +5533,6 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5732,9 +5642,6 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -5853,9 +5760,6 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6001,9 +5905,6 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6164,10 +6065,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v4
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6182,11 +6083,8 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v4
@@ -6386,9 +6284,6 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6639,10 +6534,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -6659,10 +6554,7 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v8
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v8
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v8
@@ -7022,9 +6914,6 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -7496,9 +7385,6 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -8201,9 +8087,6 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCN-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 88b67e2427fbeb..7cdf270810dea0 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -27,9 +27,6 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(
 ; GCNX3-HSA-LABEL: global_load_i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -109,9 +106,6 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v2i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -192,9 +186,6 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v3i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -279,9 +270,6 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v4i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -364,9 +352,6 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v8i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, s2
@@ -473,9 +458,6 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
 ; GCNX3-HSA-LABEL: global_load_v9i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -607,9 +589,6 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v10i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -740,9 +719,6 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v11i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -878,9 +854,6 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v12i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
@@ -1014,9 +987,6 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v16i32:
 ; GCNX3-HSA:       ; %bb.0: ; %entry
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -1164,9 +1134,6 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr
 ; GCNX3-HSA-LABEL: global_zextload_i32_to_i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1250,9 +1217,6 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr
 ; GCNX3-HSA-LABEL: global_sextload_i32_to_i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1337,9 +1301,6 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1423,9 +1384,6 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1513,9 +1471,6 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1613,9 +1568,6 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1718,10 +1670,8 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v5, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v5
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1729,7 +1679,6 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, s3
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v7, v5
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v8, s2
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v4, v2
@@ -1847,9 +1796,6 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -1991,10 +1937,8 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v9, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v9
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2009,7 +1953,6 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v15, s3
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v14, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 48
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v11, v9
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v13, s1
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v12, s0
@@ -2187,9 +2130,6 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
 ; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2426,9 +2366,6 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -2790,10 +2727,8 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v17, 0
+; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v17
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -2827,7 +2762,6 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v21, s1
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v26, s2
 ; GCNX3-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCNX3-HSA-NEXT:    v_mov_b32_e32 v19, v17
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v20, s0
 ; GCNX3-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCNX3-HSA-NEXT:    s_waitcnt vmcnt(3)
@@ -3181,9 +3115,6 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCNX3-HSA-NEXT:    v_mov_b32_e32 v1, s3
@@ -3655,7 +3586,7 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCN-GFX900-HSA-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-GFX900-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GCN-GFX900-HSA-NEXT:    v_mov_b32_e32 v8, 0
-; GCN-GFX900-HSA-NEXT:    s_add_u32 s16, s16, s15
+; GCN-GFX900-HSA-NEXT:    s_add_u32 s16, s16, s13
 ; GCN-GFX900-HSA-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-GFX900-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-GFX900-HSA-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
@@ -3971,9 +3902,6 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
 ; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
@@ -4498,9 +4426,6 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
 ; GCNX3-HSA-LABEL: global_load_v32i32:
 ; GCNX3-HSA:       ; %bb.0:
 ; GCNX3-HSA-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCNX3-HSA-NEXT:    s_add_i32 s10, s10, s15
-; GCNX3-HSA-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCNX3-HSA-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCNX3-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCNX3-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCNX3-HSA-NEXT:    s_addc_u32 s5, s3, 0
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 1a6fa3c518ca7a..4dfc773d615e4f 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -13,8 +13,7 @@
 ; GCN: s_cselect_b32
 
 ; GCN-NOT: load_dword
-; GCN: flat_load_dword
-; GCN: flat_load_dword
+; GCN: flat_load_dwordx2
 ; GCN-NOT: load_dword
 
 ; GCN: flat_store_dwordx2
diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
index 245a2775d9f2fb..e876a8d9dda692 100644
--- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
 
 ; GCN-LABEL: {{^}}get_global_id_0:
 ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
-; GCN: s_mul_i32 [[MUL:s[0-9]+]], s12, [[WGSIZEX]]
+; GCN: s_mul_i32 [[MUL:s[0-9]+]], s10, [[WGSIZEX]]
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0
 define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 {
   %dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index e8632871f56ea4..92536c2078514a 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -11,8 +11,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX9-NEXT:    s_mul_i32 s12, s12, s4
-; GFX9-NEXT:    s_add_i32 s5, s5, s12
+; GFX9-NEXT:    s_mul_i32 s10, s10, s4
+; GFX9-NEXT:    s_add_i32 s5, s5, s10
 ; GFX9-NEXT:    v_add_u32_e32 v0, s5, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX9-NEXT:    v_lshlrev_b64 v[4:5], 4, v[0:1]
@@ -39,8 +39,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xffff
-; GFX10-NEXT:    s_mul_i32 s12, s12, s4
-; GFX10-NEXT:    v_add3_u32 v0, s5, s12, v0
+; GFX10-NEXT:    s_mul_i32 s10, s10, s4
+; GFX10-NEXT:    v_add3_u32 v0, s5, s10, v0
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GFX10-NEXT:    v_lshlrev_b64 v[4:5], 4, v[0:1]
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, s0, v4
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 0348737a41a30f..0a76e169e9c385 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -9,8 +9,6 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
 ; CHECK-LABEL: memcpy_p0_p0_minsize:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s3
@@ -188,7 +186,7 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    s_add_u32 s16, s16, s15
+; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
 ; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
@@ -568,9 +566,7 @@ define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x8
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT:    s_add_u32 s16, s16, s15
+; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
@@ -985,8 +981,6 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
 ; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
 ; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
@@ -1261,8 +1255,6 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
 ; CHECK-LABEL: memcpy_p0_p0_optsize:
 ; CHECK:       ; %bb.0: ; %entry
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s3
@@ -1440,7 +1432,7 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
 ; CHECK-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x8
 ; CHECK-NEXT:    s_load_dword s2, s[6:7], 0x0
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
-; CHECK-NEXT:    s_add_u32 s16, s16, s15
+; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    global_load_ubyte v2, v0, s[0:1] offset:15
 ; CHECK-NEXT:    global_load_ubyte v3, v0, s[0:1] offset:14
@@ -1820,9 +1812,7 @@ define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %
 ; CHECK-NEXT:    s_mov_b64 s[18:19], s[2:3]
 ; CHECK-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x8
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
-; CHECK-NEXT:    s_add_u32 s16, s16, s15
+; CHECK-NEXT:    s_add_u32 s16, s16, s13
 ; CHECK-NEXT:    s_addc_u32 s17, s17, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v2, s0
@@ -2237,8 +2227,6 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
 ; CHECK-NEXT:    ds_read_u8 v8, v2 offset:117
 ; CHECK-NEXT:    ds_read_u8 v9, v2 offset:118
 ; CHECK-NEXT:    ds_read_u8 v10, v2 offset:119
-; CHECK-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; CHECK-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s0
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index b201e6ddd7a5b7..45e8b3bcff13c5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -15,9 +15,6 @@
 define amdgpu_kernel void @flat_agent_unordered_load(
 ; GFX7-LABEL: flat_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -215,9 +200,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_load(
 ; GFX7-LABEL: flat_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -233,10 +215,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -253,10 +231,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -288,8 +262,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -304,8 +276,6 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -415,9 +385,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_load(
 ; GFX7-LABEL: flat_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -434,10 +401,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -456,10 +419,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -493,8 +452,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -510,8 +467,6 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -634,9 +589,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_load(
 ; GFX7-LABEL: flat_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -654,10 +606,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -678,10 +626,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -718,8 +662,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -736,8 +678,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -875,9 +815,6 @@ entry:
 define amdgpu_kernel void @flat_agent_unordered_store(
 ; GFX7-LABEL: flat_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -889,10 +826,6 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -905,10 +838,6 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -932,8 +861,6 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -945,8 +872,6 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1030,9 +955,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_store(
 ; GFX7-LABEL: flat_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1044,10 +966,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1060,10 +978,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1087,8 +1001,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1100,8 +1012,6 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1185,9 +1095,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_store(
 ; GFX7-LABEL: flat_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1200,10 +1107,6 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1218,10 +1121,6 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1248,8 +1147,6 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1262,8 +1159,6 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1366,9 +1261,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_store(
 ; GFX7-LABEL: flat_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1381,10 +1273,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1399,10 +1287,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1429,8 +1313,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1443,8 +1325,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1547,9 +1427,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1562,10 +1439,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1579,10 +1452,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1608,8 +1477,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1622,8 +1489,6 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1714,9 +1579,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ; GFX7-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1731,10 +1593,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1752,10 +1610,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1786,8 +1640,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1802,8 +1654,6 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1912,9 +1762,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ; GFX7-LABEL: flat_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1928,10 +1775,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1947,10 +1790,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1979,8 +1818,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1994,8 +1831,6 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2105,9 +1940,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2123,10 +1955,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2146,10 +1974,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2183,8 +2007,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2200,8 +2022,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2329,9 +2149,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2347,10 +2164,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2370,10 +2183,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2407,8 +2216,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2424,8 +2231,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2553,9 +2358,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2572,10 +2374,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2594,10 +2392,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2631,8 +2425,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2648,8 +2440,6 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2772,9 +2562,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2792,10 +2579,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2816,10 +2599,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2856,8 +2635,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2874,8 +2651,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3017,9 +2792,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3037,10 +2809,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3061,10 +2829,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3101,8 +2865,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3119,8 +2881,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3262,9 +3022,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3290,10 +3047,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3319,10 +3072,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3373,8 +3122,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3390,8 +3137,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3508,9 +3253,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3538,10 +3280,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3571,10 +3309,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3630,8 +3364,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3649,8 +3381,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3785,9 +3515,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3814,10 +3541,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3845,10 +3568,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3902,8 +3621,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3920,8 +3637,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4057,9 +3772,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4088,10 +3800,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4123,10 +3831,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4185,8 +3889,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4205,8 +3907,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4360,9 +4060,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4391,10 +4088,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4426,10 +4119,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4488,8 +4177,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4508,8 +4195,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4663,9 +4348,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4693,10 +4375,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4726,10 +4404,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4785,8 +4459,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4804,8 +4476,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4940,9 +4610,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4970,10 +4637,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5003,10 +4666,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5062,8 +4721,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5081,8 +4738,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5217,9 +4872,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5248,10 +4900,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5283,10 +4931,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5345,8 +4989,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5365,8 +5007,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5520,9 +5160,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5551,10 +5188,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5586,10 +5219,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5648,8 +5277,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5668,8 +5295,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5823,9 +5448,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5854,10 +5476,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5889,10 +5507,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5951,8 +5565,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5971,8 +5583,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6126,9 +5736,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6157,10 +5764,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6192,10 +5795,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6254,8 +5853,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6274,8 +5871,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6429,9 +6024,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6460,10 +6052,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6495,10 +6083,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6557,8 +6141,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6577,8 +6159,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6732,9 +6312,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6763,10 +6340,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6798,10 +6371,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6860,8 +6429,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6880,8 +6447,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7035,9 +6600,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7066,10 +6628,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7101,10 +6659,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7163,8 +6717,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7183,8 +6735,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7338,9 +6888,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7369,10 +6916,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7404,10 +6947,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7466,8 +7005,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7486,8 +7023,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7641,9 +7176,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7672,10 +7204,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7704,10 +7232,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7764,8 +7288,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7784,8 +7306,6 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7929,9 +7449,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7961,10 +7478,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7995,10 +7508,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8057,8 +7566,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8078,8 +7585,6 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8236,9 +7741,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8268,10 +7770,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8302,10 +7800,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8365,8 +7859,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8386,8 +7878,6 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8550,9 +8040,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8583,10 +8070,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8619,10 +8102,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8684,8 +8163,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8706,8 +8183,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8883,9 +8358,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8916,10 +8388,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8952,10 +8420,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9017,8 +8481,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9039,8 +8501,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9216,9 +8676,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9248,10 +8705,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9282,10 +8735,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9344,8 +8793,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9365,8 +8812,6 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9523,9 +8968,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9555,10 +8997,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9589,10 +9027,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9651,8 +9085,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9672,8 +9104,6 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9830,9 +9260,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9863,10 +9290,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9899,10 +9322,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9964,8 +9383,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9986,8 +9403,6 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10163,9 +9578,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10196,10 +9608,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10232,10 +9640,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10297,8 +9701,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10319,8 +9721,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10496,9 +9896,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10529,10 +9926,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10565,10 +9958,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10630,8 +10019,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10652,8 +10039,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10829,9 +10214,6 @@ entry:
 define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10862,10 +10244,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10898,10 +10276,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10963,8 +10337,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10985,8 +10357,6 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11162,9 +10532,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11195,10 +10562,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11231,10 +10594,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11296,8 +10655,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11318,8 +10675,6 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11495,9 +10850,6 @@ entry:
 define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11528,10 +10880,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11564,10 +10912,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11629,8 +10973,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11651,8 +10993,6 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11828,9 +11168,6 @@ entry:
 define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11861,10 +11198,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11897,10 +11230,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11962,8 +11291,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11984,8 +11311,6 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12161,9 +11486,6 @@ entry:
 define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -12194,10 +11516,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12230,10 +11548,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12295,8 +11609,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12317,8 +11629,6 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12494,9 +11804,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ; GFX7-LABEL: flat_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12512,10 +11819,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12532,10 +11835,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12567,8 +11866,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12583,8 +11880,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12694,9 +11989,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12712,10 +12004,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12732,10 +12020,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12767,8 +12051,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12783,8 +12065,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12894,9 +12174,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ; GFX7-LABEL: flat_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12914,10 +12191,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12937,10 +12210,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12976,8 +12245,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12994,8 +12261,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13123,9 +12388,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -13144,10 +12406,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13169,10 +12427,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13211,8 +12465,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13230,8 +12482,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13374,9 +12624,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ; GFX7-LABEL: flat_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13388,10 +12635,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13404,10 +12647,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13431,8 +12670,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13444,8 +12681,6 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13529,9 +12764,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13543,10 +12775,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13559,10 +12787,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13586,8 +12810,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13599,8 +12821,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13684,9 +12904,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_store(
 ; GFX7-LABEL: flat_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13699,10 +12916,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13717,10 +12930,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13747,8 +12956,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13761,8 +12968,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13865,9 +13070,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13880,10 +13082,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13898,10 +13096,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13928,8 +13122,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13942,8 +13134,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -14046,9 +13236,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14061,10 +13248,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14078,10 +13261,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14107,8 +13286,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14121,8 +13298,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14213,9 +13388,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14230,10 +13402,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14250,10 +13418,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14283,8 +13447,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14299,8 +13461,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14407,9 +13567,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14423,10 +13580,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14442,10 +13595,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14474,8 +13623,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14489,8 +13636,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14600,9 +13745,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14618,10 +13760,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14640,10 +13778,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14676,8 +13810,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14693,8 +13825,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14820,9 +13950,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14838,10 +13965,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14860,10 +13983,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14896,8 +14015,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14913,8 +14030,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -15040,9 +14155,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15060,10 +14172,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15083,10 +14191,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15122,8 +14226,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15140,8 +14242,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15269,9 +14369,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15290,10 +14387,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15315,10 +14408,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15357,8 +14446,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15376,8 +14463,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15524,9 +14609,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15545,10 +14627,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15570,10 +14648,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15612,8 +14686,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15631,8 +14703,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15779,9 +14849,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15807,10 +14874,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15836,10 +14899,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15890,8 +14949,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15907,8 +14964,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16025,9 +15080,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16055,10 +15107,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16087,10 +15135,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16145,8 +15189,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16164,8 +15206,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16298,9 +15338,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16327,10 +15364,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16358,10 +15391,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16415,8 +15444,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16433,8 +15460,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16570,9 +15595,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16601,10 +15623,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16635,10 +15653,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16696,8 +15710,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16716,8 +15728,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16869,9 +15879,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16900,10 +15907,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16934,10 +15937,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16995,8 +15994,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17015,8 +16012,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17168,9 +16163,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17198,10 +16190,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17230,10 +16218,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17288,8 +16272,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17307,8 +16289,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17441,9 +16421,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17471,10 +16448,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17503,10 +16476,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17561,8 +16530,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17580,8 +16547,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17714,9 +16679,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17745,10 +16707,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17779,10 +16737,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17840,8 +16794,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17860,8 +16812,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18013,9 +16963,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18044,10 +16991,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18078,10 +17021,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18139,8 +17078,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18159,8 +17096,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18312,9 +17247,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18343,10 +17275,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18377,10 +17305,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18438,8 +17362,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18458,8 +17380,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18611,9 +17531,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18642,10 +17559,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18676,10 +17589,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18737,8 +17646,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18757,8 +17664,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18910,9 +17815,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18941,10 +17843,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18975,10 +17873,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19036,8 +17930,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19056,8 +17948,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19209,9 +18099,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19240,10 +18127,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19274,10 +18157,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19335,8 +18214,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19355,8 +18232,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19508,9 +18383,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19539,10 +18411,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19573,10 +18441,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19634,8 +18498,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19654,8 +18516,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19807,9 +18667,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19838,10 +18695,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19872,10 +18725,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19933,8 +18782,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19953,8 +18800,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20106,9 +18951,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20137,10 +18979,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20169,10 +19007,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20229,8 +19063,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20249,8 +19081,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20394,9 +19224,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20427,10 +19254,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20462,10 +19285,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20526,8 +19345,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20548,8 +19365,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20711,9 +19526,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20743,10 +19555,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20777,10 +19585,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20840,8 +19644,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20861,8 +19663,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21025,9 +19825,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21059,10 +19856,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21096,10 +19889,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21163,8 +19952,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21186,8 +19973,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21368,9 +20153,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21402,10 +20184,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21439,10 +20217,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21506,8 +20280,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21529,8 +20301,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21711,9 +20481,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21744,10 +20511,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21779,10 +20542,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21843,8 +20602,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21865,8 +20622,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22028,9 +20783,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22061,10 +20813,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22096,10 +20844,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22160,8 +20904,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22182,8 +20924,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22345,9 +21085,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22379,10 +21116,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22416,10 +21149,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22483,8 +21212,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22506,8 +21233,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22688,9 +21413,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22722,10 +21444,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22759,10 +21477,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22826,8 +21540,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22849,8 +21561,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23031,9 +21741,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23065,10 +21772,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23102,10 +21805,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23169,8 +21868,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23192,8 +21889,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23374,9 +22069,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23408,10 +22100,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23445,10 +22133,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23512,8 +22196,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23535,8 +22217,6 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23717,9 +22397,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23751,10 +22428,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23788,10 +22461,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23855,8 +22524,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23878,8 +22545,6 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24060,9 +22725,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -24094,10 +22756,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24131,10 +22789,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24198,8 +22852,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24221,8 +22873,6 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24403,9 +23053,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -24437,10 +23084,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24474,10 +23117,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24541,8 +23180,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24564,8 +23201,6 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24746,9 +23381,6 @@ entry:
 define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -24780,10 +23412,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24817,10 +23445,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24884,8 +23508,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24907,8 +23529,6 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 5a89e1269526bd..5fa8e6891bafb1 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -15,9 +15,6 @@
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -215,9 +200,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX7-LABEL: flat_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -246,10 +228,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
@@ -277,10 +255,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_mov_b32 s6, 2
@@ -336,8 +310,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
@@ -367,8 +339,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b32 s6, 0x3ff
@@ -569,9 +539,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_0(
 ; GFX7-LABEL: flat_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -587,10 +554,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -607,10 +570,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -642,8 +601,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -658,8 +615,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -769,9 +724,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX7-LABEL: flat_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -800,10 +752,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -832,10 +780,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -892,8 +836,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -924,8 +866,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1127,9 +1067,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ; GFX7-LABEL: flat_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -1146,10 +1083,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1167,10 +1100,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1204,8 +1133,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1221,8 +1148,6 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index f39801b6b7ade9..4c9ce15211e34c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -15,9 +15,6 @@
 define amdgpu_kernel void @flat_singlethread_unordered_load(
 ; GFX7-LABEL: flat_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -215,9 +200,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ; GFX7-LABEL: flat_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -233,10 +215,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -253,10 +231,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -288,8 +262,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -304,8 +276,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -415,9 +385,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_load(
 ; GFX7-LABEL: flat_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -433,10 +400,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -453,10 +416,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -488,8 +447,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -504,8 +461,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -615,9 +570,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ; GFX7-LABEL: flat_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -633,10 +585,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -653,10 +601,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -688,8 +632,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -704,8 +646,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -815,9 +755,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_unordered_store(
 ; GFX7-LABEL: flat_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -829,10 +766,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -845,10 +778,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -872,8 +801,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -885,8 +812,6 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -970,9 +895,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ; GFX7-LABEL: flat_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -984,10 +906,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1000,10 +918,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1027,8 +941,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1040,8 +952,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1125,9 +1035,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_store(
 ; GFX7-LABEL: flat_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1139,10 +1046,6 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1155,10 +1058,6 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1182,8 +1081,6 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1195,8 +1092,6 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1280,9 +1175,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ; GFX7-LABEL: flat_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1294,10 +1186,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1310,10 +1198,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1337,8 +1221,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1350,8 +1232,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1435,9 +1315,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1450,10 +1327,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1467,10 +1340,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1496,8 +1365,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1510,8 +1377,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1602,9 +1467,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1617,10 +1479,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1634,10 +1492,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1663,8 +1517,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1677,8 +1529,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1769,9 +1619,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1784,10 +1631,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1801,10 +1644,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1830,8 +1669,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1844,8 +1681,6 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1936,9 +1771,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1951,10 +1783,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1968,10 +1796,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1997,8 +1821,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2011,8 +1833,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2103,9 +1923,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2118,10 +1935,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2135,10 +1948,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2164,8 +1973,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2178,8 +1985,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2270,9 +2075,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2288,10 +2090,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2308,10 +2106,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2343,8 +2137,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2359,8 +2151,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2470,9 +2260,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2488,10 +2275,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2508,10 +2291,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2543,8 +2322,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2559,8 +2336,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2670,9 +2445,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2688,10 +2460,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2708,10 +2476,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2743,8 +2507,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2759,8 +2521,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2870,9 +2630,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2898,10 +2655,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2927,10 +2680,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2981,8 +2730,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2998,8 +2745,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3116,9 +2861,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3144,10 +2886,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3173,10 +2911,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3227,8 +2961,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3244,8 +2976,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3362,9 +3092,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3390,10 +3117,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3419,10 +3142,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3473,8 +3192,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3490,8 +3207,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3608,9 +3323,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3636,10 +3348,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3665,10 +3373,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3719,8 +3423,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3736,8 +3438,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3854,9 +3554,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3882,10 +3579,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3911,10 +3604,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3965,8 +3654,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3982,8 +3669,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4100,9 +3785,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4128,10 +3810,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4157,10 +3835,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4211,8 +3885,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4228,8 +3900,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4346,9 +4016,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4374,10 +4041,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4403,10 +4066,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4457,8 +4116,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4474,8 +4131,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4592,9 +4247,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4620,10 +4272,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4649,10 +4297,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4703,8 +4347,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4720,8 +4362,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4838,9 +4478,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4866,10 +4503,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4895,10 +4528,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4949,8 +4578,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4966,8 +4593,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5084,9 +4709,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5112,10 +4734,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5141,10 +4759,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5195,8 +4809,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5212,8 +4824,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5330,9 +4940,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5358,10 +4965,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5387,10 +4990,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5441,8 +5040,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5458,8 +5055,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5576,9 +5171,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5604,10 +5196,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5633,10 +5221,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5687,8 +5271,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5704,8 +5286,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5822,9 +5402,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5850,10 +5427,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5879,10 +5452,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5933,8 +5502,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5950,8 +5517,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6068,9 +5633,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6096,10 +5658,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6125,10 +5683,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6179,8 +5733,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6196,8 +5748,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6314,9 +5864,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6342,10 +5889,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6371,10 +5914,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6425,8 +5964,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6442,8 +5979,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6560,9 +6095,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6591,10 +6123,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6623,10 +6151,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6683,8 +6207,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6703,8 +6225,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6848,9 +6368,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6879,10 +6396,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6911,10 +6424,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6971,8 +6480,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6991,8 +6498,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7136,9 +6641,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7167,10 +6669,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7199,10 +6697,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7259,8 +6753,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7279,8 +6771,6 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7424,9 +6914,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7455,10 +6942,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7487,10 +6970,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7547,8 +7026,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7567,8 +7044,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7712,9 +7187,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7743,10 +7215,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7775,10 +7243,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7835,8 +7299,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7855,8 +7317,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8000,9 +7460,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8031,10 +7488,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8063,10 +7516,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8123,8 +7572,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8143,8 +7590,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8288,9 +7733,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8319,10 +7761,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8351,10 +7789,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8411,8 +7845,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8431,8 +7863,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8576,9 +8006,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8607,10 +8034,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8639,10 +8062,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8699,8 +8118,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8719,8 +8136,6 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8864,9 +8279,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8895,10 +8307,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8927,10 +8335,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8987,8 +8391,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9007,8 +8409,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9152,9 +8552,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9183,10 +8580,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9215,10 +8608,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9275,8 +8664,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9295,8 +8682,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9440,9 +8825,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9471,10 +8853,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9503,10 +8881,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9563,8 +8937,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9583,8 +8955,6 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9728,9 +9098,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9759,10 +9126,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9791,10 +9154,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9851,8 +9210,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9871,8 +9228,6 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10016,9 +9371,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10047,10 +9399,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10079,10 +9427,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10139,8 +9483,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10159,8 +9501,6 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10304,9 +9644,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10335,10 +9672,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10367,10 +9700,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10427,8 +9756,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10447,8 +9774,6 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10592,9 +9917,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10623,10 +9945,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10655,10 +9973,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10715,8 +10029,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10735,8 +10047,6 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10880,9 +10190,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10898,10 +10205,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10918,10 +10221,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10953,8 +10252,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10969,8 +10266,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11080,9 +10375,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11098,10 +10390,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11118,10 +10406,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11153,8 +10437,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11169,8 +10451,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11280,9 +10560,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11298,10 +10575,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11318,10 +10591,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11353,8 +10622,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11369,8 +10636,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11480,9 +10745,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11498,10 +10760,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11518,10 +10776,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11553,8 +10807,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11569,8 +10821,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11680,9 +10930,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11694,10 +10941,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11710,10 +10953,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11737,8 +10976,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11750,8 +10987,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11835,9 +11070,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11849,10 +11081,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11865,10 +11093,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11892,8 +11116,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11905,8 +11127,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11990,9 +11210,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ; GFX7-LABEL: flat_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12004,10 +11221,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12020,10 +11233,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12047,8 +11256,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12060,8 +11267,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12145,9 +11350,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12159,10 +11361,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12175,10 +11373,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12202,8 +11396,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12215,8 +11407,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12300,9 +11490,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12315,10 +11502,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12332,10 +11515,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12361,8 +11540,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12375,8 +11552,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12467,9 +11642,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12482,10 +11654,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12499,10 +11667,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12528,8 +11692,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12542,8 +11704,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12634,9 +11794,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12649,10 +11806,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12666,10 +11819,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12695,8 +11844,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12709,8 +11856,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12801,9 +11946,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12816,10 +11958,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12833,10 +11971,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12862,8 +11996,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12876,8 +12008,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12968,9 +12098,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12983,10 +12110,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13000,10 +12123,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13029,8 +12148,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13043,8 +12160,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13135,9 +12250,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13153,10 +12265,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13173,10 +12281,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13208,8 +12312,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13224,8 +12326,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13335,9 +12435,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13353,10 +12450,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13373,10 +12466,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13408,8 +12497,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13424,8 +12511,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13535,9 +12620,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13553,10 +12635,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13573,10 +12651,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13608,8 +12682,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13624,8 +12696,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13735,9 +12805,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13763,10 +12830,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13792,10 +12855,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13846,8 +12905,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13863,8 +12920,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13981,9 +13036,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14009,10 +13061,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14038,10 +13086,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14092,8 +13136,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14109,8 +13151,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14227,9 +13267,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14255,10 +13292,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14284,10 +13317,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14338,8 +13367,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14355,8 +13382,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14473,9 +13498,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14501,10 +13523,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14530,10 +13548,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14584,8 +13598,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14601,8 +13613,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14719,9 +13729,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14747,10 +13754,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14776,10 +13779,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14830,8 +13829,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14847,8 +13844,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14965,9 +13960,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14993,10 +13985,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15022,10 +14010,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15076,8 +14060,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15093,8 +14075,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15211,9 +14191,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15239,10 +14216,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15268,10 +14241,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15322,8 +14291,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15339,8 +14306,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15457,9 +14422,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15485,10 +14447,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15514,10 +14472,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15568,8 +14522,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15585,8 +14537,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15703,9 +14653,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15731,10 +14678,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15760,10 +14703,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15814,8 +14753,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15831,8 +14768,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15949,9 +14884,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15977,10 +14909,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16006,10 +14934,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16060,8 +14984,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16077,8 +14999,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16195,9 +15115,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16223,10 +15140,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16252,10 +15165,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16306,8 +15215,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16323,8 +15230,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16441,9 +15346,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16469,10 +15371,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16498,10 +15396,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16552,8 +15446,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16569,8 +15461,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16687,9 +15577,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16715,10 +15602,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16744,10 +15627,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16798,8 +15677,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16815,8 +15692,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16933,9 +15808,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16961,10 +15833,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16990,10 +15858,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17044,8 +15908,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17061,8 +15923,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17179,9 +16039,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17207,10 +16064,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17236,10 +16089,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17290,8 +16139,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17307,8 +16154,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17425,9 +16270,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17456,10 +16298,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17488,10 +16326,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17548,8 +16382,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17568,8 +16400,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17713,9 +16543,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17744,10 +16571,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17776,10 +16599,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17836,8 +16655,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17856,8 +16673,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18001,9 +16816,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18032,10 +16844,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18064,10 +16872,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18124,8 +16928,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18144,8 +16946,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18289,9 +17089,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18320,10 +17117,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18352,10 +17145,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18412,8 +17201,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18432,8 +17219,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18577,9 +17362,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18608,10 +17390,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18640,10 +17418,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18700,8 +17474,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18720,8 +17492,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18865,9 +17635,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18896,10 +17663,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18928,10 +17691,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18988,8 +17747,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19008,8 +17765,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19153,9 +17908,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19184,10 +17936,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19216,10 +17964,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19276,8 +18020,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19296,8 +18038,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19441,9 +18181,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19472,10 +18209,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19504,10 +18237,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19564,8 +18293,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19584,8 +18311,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19729,9 +18454,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19760,10 +18482,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19792,10 +18510,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19852,8 +18566,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19872,8 +18584,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20017,9 +18727,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20048,10 +18755,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20080,10 +18783,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20140,8 +18839,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20160,8 +18857,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20305,9 +19000,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20336,10 +19028,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20368,10 +19056,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20428,8 +19112,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20448,8 +19130,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20593,9 +19273,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20624,10 +19301,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20656,10 +19329,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20716,8 +19385,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20736,8 +19403,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20881,9 +19546,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20912,10 +19574,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20944,10 +19602,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21004,8 +19658,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21024,8 +19676,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21169,9 +19819,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21200,10 +19847,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21232,10 +19875,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21292,8 +19931,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21312,8 +19949,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21457,9 +20092,6 @@ entry:
 define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21488,10 +20120,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21520,10 +20148,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21580,8 +20204,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21600,8 +20222,6 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index ae359090ea5f35..e77f1432c1c9d0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -15,9 +15,6 @@
 define amdgpu_kernel void @flat_system_unordered_load(
 ; GFX7-LABEL: flat_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_system_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -215,9 +200,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_load(
 ; GFX7-LABEL: flat_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -233,10 +215,6 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -253,10 +231,6 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -288,8 +262,6 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -304,8 +276,6 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -415,9 +385,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_load(
 ; GFX7-LABEL: flat_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -434,10 +401,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -456,10 +419,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -493,8 +452,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -511,8 +468,6 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -636,9 +591,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_load(
 ; GFX7-LABEL: flat_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -656,10 +608,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -680,10 +628,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -720,8 +664,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -739,8 +681,6 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -879,9 +819,6 @@ entry:
 define amdgpu_kernel void @flat_system_unordered_store(
 ; GFX7-LABEL: flat_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -893,10 +830,6 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -909,10 +842,6 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_system_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -936,8 +865,6 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -949,8 +876,6 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1034,9 +959,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_store(
 ; GFX7-LABEL: flat_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1048,10 +970,6 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1064,10 +982,6 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1091,8 +1005,6 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1104,8 +1016,6 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1189,9 +1099,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_store(
 ; GFX7-LABEL: flat_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1204,10 +1111,6 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1222,10 +1125,6 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX10-CU-LABEL: flat_system_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1252,8 +1151,6 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1267,8 +1164,6 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1372,9 +1267,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_store(
 ; GFX7-LABEL: flat_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1387,10 +1279,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1405,10 +1293,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1435,8 +1319,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1450,8 +1332,6 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1555,9 +1435,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1570,10 +1447,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1587,10 +1460,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1616,8 +1485,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1630,8 +1497,6 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1722,9 +1587,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ; GFX7-LABEL: flat_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1739,10 +1601,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1760,10 +1618,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1794,8 +1648,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1811,8 +1663,6 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1922,9 +1772,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_atomicrmw(
 ; GFX7-LABEL: flat_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1938,10 +1785,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1957,10 +1800,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1989,8 +1828,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2005,8 +1842,6 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2117,9 +1952,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2135,10 +1967,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2158,10 +1986,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2195,8 +2019,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2214,8 +2036,6 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2345,9 +2165,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2363,10 +2180,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2386,10 +2199,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2423,8 +2232,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2442,8 +2249,6 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2573,9 +2378,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2592,10 +2394,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2614,10 +2412,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2651,8 +2445,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2669,8 +2461,6 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2794,9 +2584,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2814,10 +2601,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2838,10 +2621,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2878,8 +2657,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2898,8 +2675,6 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3043,9 +2818,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3063,10 +2835,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3087,10 +2855,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3127,8 +2891,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3147,8 +2909,6 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3292,9 +3052,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3320,10 +3077,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3349,10 +3102,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3403,8 +3152,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3420,8 +3167,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3538,9 +3283,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3568,10 +3310,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3601,10 +3339,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3660,8 +3394,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3680,8 +3412,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3817,9 +3547,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3846,10 +3573,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3877,10 +3600,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3934,8 +3653,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3953,8 +3670,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4091,9 +3806,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4122,10 +3834,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4157,10 +3865,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4219,8 +3923,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4241,8 +3943,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4398,9 +4098,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4429,10 +4126,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4464,10 +4157,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4526,8 +4215,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4548,8 +4235,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4705,9 +4390,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4735,10 +4417,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4768,10 +4446,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4827,8 +4501,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4847,8 +4519,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4984,9 +4654,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5014,10 +4681,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5047,10 +4710,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5106,8 +4765,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5126,8 +4783,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5263,9 +4918,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5294,10 +4946,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5329,10 +4977,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5391,8 +5035,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5413,8 +5055,6 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5570,9 +5210,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5601,10 +5238,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5636,10 +5269,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5698,8 +5327,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5720,8 +5347,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5877,9 +5502,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5908,10 +5530,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5943,10 +5561,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6005,8 +5619,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6027,8 +5639,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6184,9 +5794,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6215,10 +5822,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6250,10 +5853,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6312,8 +5911,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6334,8 +5931,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6491,9 +6086,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6522,10 +6114,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6557,10 +6145,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6619,8 +6203,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6641,8 +6223,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6798,9 +6378,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6829,10 +6406,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6864,10 +6437,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6926,8 +6495,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6948,8 +6515,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7105,9 +6670,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7136,10 +6698,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7171,10 +6729,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7233,8 +6787,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7255,8 +6807,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7412,9 +6962,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7443,10 +6990,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7478,10 +7021,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -7540,8 +7079,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7562,8 +7099,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7719,9 +7254,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7750,10 +7282,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7782,10 +7310,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7842,8 +7366,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7862,8 +7384,6 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8007,9 +7527,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8039,10 +7556,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8073,10 +7586,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8135,8 +7644,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8157,8 +7664,6 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8316,9 +7821,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8348,10 +7850,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8382,10 +7880,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8445,8 +7939,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8467,8 +7959,6 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8632,9 +8122,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8665,10 +8152,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8701,10 +8184,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8766,8 +8245,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8790,8 +8267,6 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8969,9 +8444,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9002,10 +8474,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9038,10 +8506,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9103,8 +8567,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9127,8 +8589,6 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9306,9 +8766,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9338,10 +8795,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9372,10 +8825,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9434,8 +8883,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9456,8 +8903,6 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9615,9 +9060,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9647,10 +9089,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9681,10 +9119,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9743,8 +9177,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9765,8 +9197,6 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9924,9 +9354,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9957,10 +9384,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9993,10 +9416,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10058,8 +9477,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10082,8 +9499,6 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10261,9 +9676,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10294,10 +9706,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10330,10 +9738,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10395,8 +9799,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10419,8 +9821,6 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10598,9 +9998,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10631,10 +10028,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10667,10 +10060,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10732,8 +10121,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10756,8 +10143,6 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10935,9 +10320,6 @@ entry:
 define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10968,10 +10350,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11004,10 +10382,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11069,8 +10443,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11093,8 +10465,6 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11272,9 +10642,6 @@ entry:
 define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11305,10 +10672,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11341,10 +10704,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11406,8 +10765,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11430,8 +10787,6 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11609,9 +10964,6 @@ entry:
 define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11642,10 +10994,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11678,10 +11026,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -11743,8 +11087,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11767,8 +11109,6 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -11946,9 +11286,6 @@ entry:
 define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11979,10 +11316,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12015,10 +11348,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12080,8 +11409,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12104,8 +11431,6 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12283,9 +11608,6 @@ entry:
 define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -12316,10 +11638,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12352,10 +11670,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -12417,8 +11731,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12441,8 +11753,6 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -12620,9 +11930,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ; GFX7-LABEL: flat_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12638,10 +11945,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12658,10 +11961,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12693,8 +11992,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12709,8 +12006,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12820,9 +12115,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ; GFX7-LABEL: flat_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12838,10 +12130,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12858,10 +12146,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12893,8 +12177,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12909,8 +12191,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13020,9 +12300,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ; GFX7-LABEL: flat_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -13040,10 +12317,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13063,10 +12336,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13102,8 +12371,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13121,8 +12388,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13251,9 +12516,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -13272,10 +12534,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13297,10 +12555,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13339,8 +12593,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13359,8 +12611,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13504,9 +12754,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ; GFX7-LABEL: flat_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13518,10 +12765,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13534,10 +12777,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13561,8 +12800,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13574,8 +12811,6 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13659,9 +12894,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ; GFX7-LABEL: flat_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13673,10 +12905,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13689,10 +12917,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13716,8 +12940,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13729,8 +12951,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13814,9 +13034,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_store(
 ; GFX7-LABEL: flat_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13829,10 +13046,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13847,10 +13060,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13877,8 +13086,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13892,8 +13099,6 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -13997,9 +13202,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14012,10 +13214,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -14030,10 +13228,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -14060,8 +13254,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -14075,8 +13267,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -14180,9 +13370,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14195,10 +13382,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14212,10 +13395,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14241,8 +13420,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14255,8 +13432,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14347,9 +13522,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14364,10 +13536,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14384,10 +13552,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14417,8 +13581,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14434,8 +13596,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14543,9 +13703,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14559,10 +13716,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14578,10 +13731,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14610,8 +13759,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14626,8 +13773,6 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14738,9 +13883,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14756,10 +13898,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -14778,10 +13916,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -14814,8 +13948,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -14833,8 +13965,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -14962,9 +14092,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14980,10 +14107,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -15002,10 +14125,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -15038,8 +14157,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -15057,8 +14174,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -15186,9 +14301,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15206,10 +14318,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15229,10 +14337,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15268,8 +14372,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15287,8 +14389,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15417,9 +14517,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15438,10 +14535,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15463,10 +14556,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15505,8 +14594,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15526,8 +14613,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15676,9 +14761,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15697,10 +14779,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15722,10 +14800,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15764,8 +14838,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15785,8 +14857,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -15935,9 +15005,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15963,10 +15030,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15992,10 +15055,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16046,8 +15105,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16063,8 +15120,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16181,9 +15236,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16211,10 +15263,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16243,10 +15291,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16301,8 +15345,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16321,8 +15363,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16456,9 +15496,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16485,10 +15522,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16516,10 +15549,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16573,8 +15602,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16592,8 +15619,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16730,9 +15755,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16761,10 +15783,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16795,10 +15813,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16856,8 +15870,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16878,8 +15890,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17033,9 +16043,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17064,10 +16071,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17098,10 +16101,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17159,8 +16158,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17181,8 +16178,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17336,9 +16331,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17366,10 +16358,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17398,10 +16386,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17456,8 +16440,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17476,8 +16458,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17611,9 +16591,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17641,10 +16618,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17673,10 +16646,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17731,8 +16700,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17751,8 +16718,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17886,9 +16851,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17917,10 +16879,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17951,10 +16909,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18012,8 +16966,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18034,8 +16986,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18189,9 +17139,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18220,10 +17167,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18254,10 +17197,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18315,8 +17254,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18337,8 +17274,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18492,9 +17427,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18523,10 +17455,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18557,10 +17485,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18618,8 +17542,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18640,8 +17562,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18795,9 +17715,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18826,10 +17743,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18860,10 +17773,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -18921,8 +17830,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18943,8 +17850,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19098,9 +18003,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19129,10 +18031,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19163,10 +18061,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19224,8 +18118,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19246,8 +18138,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19401,9 +18291,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19432,10 +18319,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19466,10 +18349,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19527,8 +18406,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19549,8 +18426,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19704,9 +18579,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19735,10 +18607,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19769,10 +18637,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -19830,8 +18694,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19852,8 +18714,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20007,9 +18867,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -20038,10 +18895,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -20072,10 +18925,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -20133,8 +18982,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20155,8 +19002,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20310,9 +19155,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20341,10 +19183,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20373,10 +19211,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20433,8 +19267,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20453,8 +19285,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20598,9 +19428,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20631,10 +19458,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20666,10 +19489,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20730,8 +19549,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20753,8 +19570,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20917,9 +19732,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20949,10 +19761,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20983,10 +19791,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21046,8 +19850,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21068,8 +19870,6 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21233,9 +20033,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21267,10 +20064,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21304,10 +20097,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21371,8 +20160,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21396,8 +20183,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21580,9 +20365,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21614,10 +20396,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21651,10 +20429,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21718,8 +20492,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21743,8 +20515,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21927,9 +20697,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21960,10 +20727,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21995,10 +20758,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22059,8 +20818,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22082,8 +20839,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22246,9 +21001,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22279,10 +21031,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22314,10 +21062,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22378,8 +21122,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22401,8 +21143,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22565,9 +21305,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22599,10 +21336,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22636,10 +21369,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22703,8 +21432,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22728,8 +21455,6 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22912,9 +21637,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22946,10 +21668,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22983,10 +21701,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23050,8 +21764,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23075,8 +21787,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23259,9 +21969,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23293,10 +22000,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23330,10 +22033,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23397,8 +22096,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23422,8 +22119,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23606,9 +22301,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23640,10 +22332,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23677,10 +22365,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -23744,8 +22428,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23769,8 +22451,6 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -23953,9 +22633,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -23987,10 +22664,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24024,10 +22697,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24091,8 +22760,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24116,8 +22783,6 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24300,9 +22965,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -24334,10 +22996,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24371,10 +23029,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24438,8 +23092,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24463,8 +23115,6 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24647,9 +23297,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -24681,10 +23328,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24718,10 +23361,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -24785,8 +23424,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24810,8 +23447,6 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -24994,9 +23629,6 @@ entry:
 define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -25028,10 +23660,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -25065,10 +23693,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -25132,8 +23756,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -25157,8 +23779,6 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index b6c1ce72f86918..6bf54ccabc9dad 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -11,9 +11,6 @@
 define amdgpu_kernel void @flat_nontemporal_load_0(
 ; GFX7-LABEL: flat_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -30,10 +27,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -51,10 +44,6 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -163,9 +152,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_load_1(
 ; GFX7-LABEL: flat_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -195,10 +181,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_mov_b32 s6, 2
@@ -227,10 +209,6 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_mov_b32 s6, 2
@@ -421,9 +399,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_0(
 ; GFX7-LABEL: flat_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -440,10 +415,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -461,10 +432,6 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -577,9 +544,6 @@ entry:
 define amdgpu_kernel void @flat_nontemporal_store_1(
 ; GFX7-LABEL: flat_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -609,10 +573,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: flat_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -642,10 +602,6 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: flat_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -841,9 +797,6 @@ entry:
 define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -860,10 +813,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -881,10 +830,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -991,9 +936,6 @@ entry:
 define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ; GFX7-LABEL: flat_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1006,10 +948,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1024,10 +962,6 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ;
 ; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index 3b931330acdd7d..c7826181cc8dde 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -15,9 +15,6 @@
 define amdgpu_kernel void @flat_wavefront_unordered_load(
 ; GFX7-LABEL: flat_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -215,9 +200,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ; GFX7-LABEL: flat_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -233,10 +215,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -253,10 +231,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -288,8 +262,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -304,8 +276,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -415,9 +385,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_load(
 ; GFX7-LABEL: flat_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -433,10 +400,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -453,10 +416,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -488,8 +447,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -504,8 +461,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -615,9 +570,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ; GFX7-LABEL: flat_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -633,10 +585,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -653,10 +601,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -688,8 +632,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -704,8 +646,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -815,9 +755,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_unordered_store(
 ; GFX7-LABEL: flat_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -829,10 +766,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -845,10 +778,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -872,8 +801,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -885,8 +812,6 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -970,9 +895,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ; GFX7-LABEL: flat_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -984,10 +906,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1000,10 +918,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1027,8 +941,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1040,8 +952,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1125,9 +1035,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_store(
 ; GFX7-LABEL: flat_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1139,10 +1046,6 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1155,10 +1058,6 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1182,8 +1081,6 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1195,8 +1092,6 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1280,9 +1175,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ; GFX7-LABEL: flat_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1294,10 +1186,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1310,10 +1198,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1337,8 +1221,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1350,8 +1232,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1435,9 +1315,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1450,10 +1327,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1467,10 +1340,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1496,8 +1365,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1510,8 +1377,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1602,9 +1467,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1617,10 +1479,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1634,10 +1492,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1663,8 +1517,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1677,8 +1529,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1769,9 +1619,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1784,10 +1631,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1801,10 +1644,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1830,8 +1669,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1844,8 +1681,6 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1936,9 +1771,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1951,10 +1783,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1968,10 +1796,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1997,8 +1821,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2011,8 +1833,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2103,9 +1923,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2118,10 +1935,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2135,10 +1948,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2164,8 +1973,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2178,8 +1985,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2270,9 +2075,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2288,10 +2090,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2308,10 +2106,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2343,8 +2137,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2359,8 +2151,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2470,9 +2260,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2488,10 +2275,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2508,10 +2291,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2543,8 +2322,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2559,8 +2336,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2670,9 +2445,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2688,10 +2460,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2708,10 +2476,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2743,8 +2507,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2759,8 +2521,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2870,9 +2630,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -2898,10 +2655,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2927,10 +2680,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -2981,8 +2730,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -2998,8 +2745,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3116,9 +2861,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3144,10 +2886,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3173,10 +2911,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3227,8 +2961,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3244,8 +2976,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3362,9 +3092,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3390,10 +3117,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3419,10 +3142,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3473,8 +3192,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3490,8 +3207,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3608,9 +3323,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3636,10 +3348,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3665,10 +3373,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3719,8 +3423,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3736,8 +3438,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3854,9 +3554,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3882,10 +3579,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3911,10 +3604,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3965,8 +3654,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3982,8 +3669,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4100,9 +3785,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4128,10 +3810,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4157,10 +3835,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4211,8 +3885,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4228,8 +3900,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4346,9 +4016,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4374,10 +4041,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4403,10 +4066,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4457,8 +4116,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4474,8 +4131,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4592,9 +4247,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4620,10 +4272,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4649,10 +4297,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4703,8 +4347,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4720,8 +4362,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4838,9 +4478,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4866,10 +4503,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4895,10 +4528,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4949,8 +4578,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4966,8 +4593,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5084,9 +4709,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5112,10 +4734,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5141,10 +4759,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5195,8 +4809,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5212,8 +4824,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5330,9 +4940,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5358,10 +4965,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5387,10 +4990,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5441,8 +5040,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5458,8 +5055,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5576,9 +5171,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5604,10 +5196,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5633,10 +5221,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5687,8 +5271,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5704,8 +5286,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5822,9 +5402,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5850,10 +5427,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5879,10 +5452,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5933,8 +5502,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5950,8 +5517,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6068,9 +5633,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6096,10 +5658,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6125,10 +5683,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6179,8 +5733,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6196,8 +5748,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6314,9 +5864,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6342,10 +5889,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6371,10 +5914,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -6425,8 +5964,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6442,8 +5979,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6560,9 +6095,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6591,10 +6123,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6623,10 +6151,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6683,8 +6207,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6703,8 +6225,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6848,9 +6368,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6879,10 +6396,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6911,10 +6424,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6971,8 +6480,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6991,8 +6498,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7136,9 +6641,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7167,10 +6669,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7199,10 +6697,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7259,8 +6753,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7279,8 +6771,6 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7424,9 +6914,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7455,10 +6942,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7487,10 +6970,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7547,8 +7026,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7567,8 +7044,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7712,9 +7187,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7743,10 +7215,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7775,10 +7243,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7835,8 +7299,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7855,8 +7317,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8000,9 +7460,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8031,10 +7488,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8063,10 +7516,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8123,8 +7572,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8143,8 +7590,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8288,9 +7733,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8319,10 +7761,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8351,10 +7789,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8411,8 +7845,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8431,8 +7863,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8576,9 +8006,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8607,10 +8034,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8639,10 +8062,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8699,8 +8118,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8719,8 +8136,6 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8864,9 +8279,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8895,10 +8307,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8927,10 +8335,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8987,8 +8391,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9007,8 +8409,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9152,9 +8552,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9183,10 +8580,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9215,10 +8608,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9275,8 +8664,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9295,8 +8682,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9440,9 +8825,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9471,10 +8853,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9503,10 +8881,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9563,8 +8937,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9583,8 +8955,6 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9728,9 +9098,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9759,10 +9126,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9791,10 +9154,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9851,8 +9210,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9871,8 +9228,6 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10016,9 +9371,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10047,10 +9399,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10079,10 +9427,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10139,8 +9483,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10159,8 +9501,6 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10304,9 +9644,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10335,10 +9672,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10367,10 +9700,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10427,8 +9756,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10447,8 +9774,6 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10592,9 +9917,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10623,10 +9945,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10655,10 +9973,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10715,8 +10029,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10735,8 +10047,6 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10880,9 +10190,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ; GFX7-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10898,10 +10205,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10918,10 +10221,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10953,8 +10252,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10969,8 +10266,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11080,9 +10375,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11098,10 +10390,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11118,10 +10406,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11153,8 +10437,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11169,8 +10451,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11280,9 +10560,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11298,10 +10575,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11318,10 +10591,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11353,8 +10622,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11369,8 +10636,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11480,9 +10745,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11498,10 +10760,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11518,10 +10776,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11553,8 +10807,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11569,8 +10821,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11680,9 +10930,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ; GFX7-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11694,10 +10941,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11710,10 +10953,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11737,8 +10976,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11750,8 +10987,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11835,9 +11070,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11849,10 +11081,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11865,10 +11093,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11892,8 +11116,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11905,8 +11127,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11990,9 +11210,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ; GFX7-LABEL: flat_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12004,10 +11221,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12020,10 +11233,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12047,8 +11256,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12060,8 +11267,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12145,9 +11350,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12159,10 +11361,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12175,10 +11373,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12202,8 +11396,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12215,8 +11407,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12300,9 +11490,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12315,10 +11502,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12332,10 +11515,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12361,8 +11540,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12375,8 +11552,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12467,9 +11642,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12482,10 +11654,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12499,10 +11667,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12528,8 +11692,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12542,8 +11704,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12634,9 +11794,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12649,10 +11806,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12666,10 +11819,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12695,8 +11844,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12709,8 +11856,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12801,9 +11946,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12816,10 +11958,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12833,10 +11971,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12862,8 +11996,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12876,8 +12008,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12968,9 +12098,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12983,10 +12110,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13000,10 +12123,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13029,8 +12148,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13043,8 +12160,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13135,9 +12250,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13153,10 +12265,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13173,10 +12281,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13208,8 +12312,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13224,8 +12326,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13335,9 +12435,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13353,10 +12450,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13373,10 +12466,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13408,8 +12497,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13424,8 +12511,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13535,9 +12620,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13553,10 +12635,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13573,10 +12651,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13608,8 +12682,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13624,8 +12696,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13735,9 +12805,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13763,10 +12830,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13792,10 +12855,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13846,8 +12905,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13863,8 +12920,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13981,9 +13036,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14009,10 +13061,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14038,10 +13086,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14092,8 +13136,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14109,8 +13151,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14227,9 +13267,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14255,10 +13292,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14284,10 +13317,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14338,8 +13367,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14355,8 +13382,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14473,9 +13498,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14501,10 +13523,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14530,10 +13548,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14584,8 +13598,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14601,8 +13613,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14719,9 +13729,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14747,10 +13754,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14776,10 +13779,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14830,8 +13829,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14847,8 +13844,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14965,9 +13960,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14993,10 +13985,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15022,10 +14010,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15076,8 +14060,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15093,8 +14075,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15211,9 +14191,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15239,10 +14216,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15268,10 +14241,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15322,8 +14291,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15339,8 +14306,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15457,9 +14422,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15485,10 +14447,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15514,10 +14472,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15568,8 +14522,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15585,8 +14537,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15703,9 +14653,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15731,10 +14678,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15760,10 +14703,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15814,8 +14753,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15831,8 +14768,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15949,9 +14884,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15977,10 +14909,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16006,10 +14934,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16060,8 +14984,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16077,8 +14999,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16195,9 +15115,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16223,10 +15140,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16252,10 +15165,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16306,8 +15215,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16323,8 +15230,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16441,9 +15346,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16469,10 +15371,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16498,10 +15396,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16552,8 +15446,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16569,8 +15461,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16687,9 +15577,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16715,10 +15602,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16744,10 +15627,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16798,8 +15677,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16815,8 +15692,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16933,9 +15808,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16961,10 +15833,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16990,10 +15858,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17044,8 +15908,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17061,8 +15923,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17179,9 +16039,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17207,10 +16064,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17236,10 +16089,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17290,8 +16139,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17307,8 +16154,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17425,9 +16270,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17456,10 +16298,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17488,10 +16326,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17548,8 +16382,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17568,8 +16400,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17713,9 +16543,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17744,10 +16571,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17776,10 +16599,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17836,8 +16655,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17856,8 +16673,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18001,9 +16816,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18032,10 +16844,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18064,10 +16872,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18124,8 +16928,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18144,8 +16946,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18289,9 +17089,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18320,10 +17117,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18352,10 +17145,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18412,8 +17201,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18432,8 +17219,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18577,9 +17362,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18608,10 +17390,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18640,10 +17418,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18700,8 +17474,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18720,8 +17492,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18865,9 +17635,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18896,10 +17663,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18928,10 +17691,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18988,8 +17747,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19008,8 +17765,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19153,9 +17908,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19184,10 +17936,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19216,10 +17964,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19276,8 +18020,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19296,8 +18038,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19441,9 +18181,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19472,10 +18209,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19504,10 +18237,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19564,8 +18293,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19584,8 +18311,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19729,9 +18454,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19760,10 +18482,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19792,10 +18510,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19852,8 +18566,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19872,8 +18584,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20017,9 +18727,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20048,10 +18755,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20080,10 +18783,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20140,8 +18839,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20160,8 +18857,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20305,9 +19000,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20336,10 +19028,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20368,10 +19056,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20428,8 +19112,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20448,8 +19130,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20593,9 +19273,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20624,10 +19301,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20656,10 +19329,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20716,8 +19385,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20736,8 +19403,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20881,9 +19546,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20912,10 +19574,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20944,10 +19602,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21004,8 +19658,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21024,8 +19676,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21169,9 +19819,6 @@ entry:
 define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21200,10 +19847,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21232,10 +19875,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21292,8 +19931,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21312,8 +19949,6 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index 54baa032c3139c..8949e4b782f630 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -15,9 +15,6 @@
 define amdgpu_kernel void @flat_workgroup_unordered_load(
 ; GFX7-LABEL: flat_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -33,10 +30,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -53,10 +46,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -88,8 +77,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -104,8 +91,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -215,9 +200,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ; GFX7-LABEL: flat_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -233,10 +215,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -253,10 +231,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -288,8 +262,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -304,8 +276,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -415,9 +385,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_load(
 ; GFX7-LABEL: flat_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -434,10 +401,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -455,10 +418,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -492,8 +451,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -509,8 +466,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -629,9 +584,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ; GFX7-LABEL: flat_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -649,10 +601,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -672,10 +620,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -711,8 +655,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -729,8 +671,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -860,9 +800,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_unordered_store(
 ; GFX7-LABEL: flat_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -874,10 +811,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -890,10 +823,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -917,8 +846,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -930,8 +857,6 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1015,9 +940,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ; GFX7-LABEL: flat_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1029,10 +951,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1045,10 +963,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1072,8 +986,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1085,8 +997,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1170,9 +1080,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_store(
 ; GFX7-LABEL: flat_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1185,10 +1092,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1203,10 +1106,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1232,8 +1131,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1246,8 +1143,6 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1343,9 +1238,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ; GFX7-LABEL: flat_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1358,10 +1250,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1376,10 +1264,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1405,8 +1289,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1419,8 +1301,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -1516,9 +1396,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1531,10 +1408,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1548,10 +1421,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1577,8 +1446,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1591,8 +1458,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1683,9 +1548,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1699,10 +1561,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1719,10 +1577,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1750,8 +1604,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1765,8 +1617,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -1869,9 +1719,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1885,10 +1732,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -1904,10 +1747,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -1935,8 +1774,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -1950,8 +1787,6 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2054,9 +1889,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2071,10 +1903,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2093,10 +1921,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2126,8 +1950,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2142,8 +1964,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2258,9 +2078,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2275,10 +2092,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -2297,10 +2110,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -2330,8 +2139,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -2346,8 +2153,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -2462,9 +2267,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2481,10 +2283,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2502,10 +2300,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2539,8 +2333,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2556,8 +2348,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2676,9 +2466,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2696,10 +2483,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2719,10 +2502,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2758,8 +2537,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2776,8 +2553,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2908,9 +2683,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2928,10 +2700,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2951,10 +2719,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -2990,8 +2754,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3008,8 +2770,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -3140,9 +2900,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3168,10 +2925,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3197,10 +2950,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3251,8 +3000,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3268,8 +3015,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3386,9 +3131,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3415,10 +3157,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3447,10 +3185,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3503,8 +3237,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3521,8 +3253,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3651,9 +3381,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3680,10 +3407,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3711,10 +3434,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3767,8 +3486,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3785,8 +3502,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -3915,9 +3630,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3945,10 +3657,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -3979,10 +3687,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4037,8 +3741,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4056,8 +3758,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4198,9 +3898,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4228,10 +3925,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4262,10 +3955,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4320,8 +4009,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4339,8 +4026,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4481,9 +4166,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4510,10 +4192,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4542,10 +4220,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4598,8 +4272,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4616,8 +4288,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4746,9 +4416,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4775,10 +4442,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4807,10 +4470,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -4863,8 +4522,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -4881,8 +4538,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5011,9 +4666,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5041,10 +4693,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5075,10 +4723,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5133,8 +4777,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5152,8 +4794,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5294,9 +4934,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5324,10 +4961,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5358,10 +4991,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5416,8 +5045,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5435,8 +5062,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5577,9 +5202,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5607,10 +5229,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5641,10 +5259,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5699,8 +5313,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5718,8 +5330,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -5860,9 +5470,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5890,10 +5497,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5924,10 +5527,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -5982,8 +5581,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6001,8 +5598,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6143,9 +5738,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6174,10 +5766,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6206,10 +5794,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6266,8 +5850,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6286,8 +5868,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6431,9 +6011,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6463,10 +6040,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6496,10 +6069,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6558,8 +6127,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6579,8 +6146,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6733,9 +6298,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6765,10 +6327,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6799,10 +6357,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -6861,8 +6415,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -6882,8 +6434,6 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7039,9 +6589,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7072,10 +6619,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7107,10 +6650,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7171,8 +6710,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7193,8 +6730,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7359,9 +6894,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7392,10 +6924,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7427,10 +6955,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7491,8 +7015,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7513,8 +7035,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7679,9 +7199,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7711,10 +7228,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7744,10 +7257,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -7806,8 +7315,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7827,8 +7334,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -7981,9 +7486,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8013,10 +7515,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8046,10 +7544,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8108,8 +7602,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8129,8 +7621,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8283,9 +7773,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8316,10 +7803,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8351,10 +7834,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8415,8 +7894,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8437,8 +7914,6 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8603,9 +8078,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8636,10 +8108,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8671,10 +8139,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8735,8 +8199,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8757,8 +8219,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -8923,9 +8383,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8956,10 +8413,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -8991,10 +8444,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9055,8 +8504,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9077,8 +8524,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9243,9 +8688,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9276,10 +8718,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9311,10 +8749,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9375,8 +8809,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9397,8 +8829,6 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9563,9 +8993,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9596,10 +9023,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9631,10 +9054,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9695,8 +9114,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9717,8 +9134,6 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -9883,9 +9298,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9916,10 +9328,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -9951,10 +9359,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10015,8 +9419,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10037,8 +9439,6 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10203,9 +9603,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10236,10 +9633,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10271,10 +9664,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10335,8 +9724,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10357,8 +9744,6 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10523,9 +9908,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10556,10 +9938,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10591,10 +9969,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -10655,8 +10029,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10677,8 +10049,6 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -10843,9 +10213,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10861,10 +10228,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -10881,10 +10244,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -10916,8 +10275,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -10932,8 +10289,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11043,9 +10398,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11061,10 +10413,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11081,10 +10429,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11116,8 +10460,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11132,8 +10474,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11243,9 +10583,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11261,10 +10598,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11283,10 +10616,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11318,8 +10647,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11334,8 +10661,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11453,9 +10778,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11471,10 +10793,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -11495,10 +10813,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -11530,8 +10844,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -11546,8 +10858,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -11673,9 +10983,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11687,10 +10994,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11703,10 +11006,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11730,8 +11029,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11743,8 +11040,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11828,9 +11123,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11842,10 +11134,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11858,10 +11146,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11885,8 +11169,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11898,8 +11180,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -11983,9 +11263,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ; GFX7-LABEL: flat_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11997,10 +11274,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12015,10 +11288,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12042,8 +11311,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12055,8 +11322,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12149,9 +11414,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12163,10 +11425,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12181,10 +11439,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12208,8 +11462,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12221,8 +11473,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x8
@@ -12315,9 +11565,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12330,10 +11577,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12347,10 +11590,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12376,8 +11615,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12390,8 +11627,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12482,9 +11717,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12497,10 +11729,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12516,10 +11744,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12545,8 +11769,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12559,8 +11781,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12659,9 +11879,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12674,10 +11891,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12693,10 +11906,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12722,8 +11931,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12736,8 +11943,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -12837,9 +12042,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12852,10 +12054,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -12873,10 +12071,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -12902,8 +12096,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -12916,8 +12108,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13025,9 +12215,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13040,10 +12227,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -13061,10 +12244,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -13090,8 +12269,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -13104,8 +12281,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -13213,9 +12388,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13231,10 +12403,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13253,10 +12421,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13288,8 +12452,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13304,8 +12466,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13423,9 +12583,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13441,10 +12598,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13465,10 +12618,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13500,8 +12649,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13516,8 +12663,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13644,9 +12789,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13662,10 +12804,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13686,10 +12824,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
 ; GFX10-CU-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13721,8 +12855,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13737,8 +12869,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[6:7], 0x8
@@ -13865,9 +12995,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13893,10 +13020,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13922,10 +13045,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -13976,8 +13095,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -13993,8 +13110,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14111,9 +13226,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14139,10 +13251,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14170,10 +13278,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14224,8 +13328,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14241,8 +13343,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14367,9 +13467,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14395,10 +13492,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14426,10 +13519,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14480,8 +13569,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14497,8 +13584,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14624,9 +13709,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14652,10 +13734,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14685,10 +13763,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14739,8 +13813,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14756,8 +13828,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -14891,9 +13961,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14919,10 +13986,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -14952,10 +14015,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15006,8 +14065,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15023,8 +14080,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15158,9 +14213,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15186,10 +14238,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15217,10 +14265,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15271,8 +14315,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15288,8 +14330,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15414,9 +14454,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15442,10 +14479,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15473,10 +14506,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15527,8 +14556,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15544,8 +14571,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15670,9 +14695,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15698,10 +14720,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15731,10 +14749,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15785,8 +14799,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15802,8 +14814,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -15937,9 +14947,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15965,10 +14972,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -15998,10 +15001,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16052,8 +15051,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16069,8 +15066,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16204,9 +15199,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16232,10 +15224,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16265,10 +15253,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16319,8 +15303,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16336,8 +15318,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16471,9 +15451,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16499,10 +15476,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16532,10 +15505,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16586,8 +15555,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16603,8 +15570,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16738,9 +15703,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16766,10 +15728,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16799,10 +15757,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -16853,8 +15807,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -16870,8 +15822,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17005,9 +15955,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17033,10 +15980,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17066,10 +16009,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17120,8 +16059,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17137,8 +16074,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17272,9 +16207,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17300,10 +16232,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17333,10 +16261,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17387,8 +16311,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17404,8 +16326,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17539,9 +16459,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17567,10 +16484,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17600,10 +16513,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s7, s[4:5], 0x8
@@ -17654,8 +16563,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17671,8 +16578,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17806,9 +16711,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17837,10 +16739,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17869,10 +16767,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -17929,8 +16823,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -17949,8 +16841,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18094,9 +16984,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18125,10 +17012,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18159,10 +17042,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18219,8 +17098,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18239,8 +17116,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18392,9 +17267,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18423,10 +17295,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18457,10 +17325,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18517,8 +17381,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18537,8 +17399,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18691,9 +17551,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18722,10 +17579,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18758,10 +17611,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -18818,8 +17667,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -18838,8 +17685,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19000,9 +17845,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19031,10 +17873,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19067,10 +17905,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19127,8 +17961,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19147,8 +17979,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19309,9 +18139,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19340,10 +18167,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19374,10 +18197,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19434,8 +18253,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19454,8 +18271,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19607,9 +18422,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19638,10 +18450,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19672,10 +18480,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19732,8 +18536,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19752,8 +18554,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -19905,9 +18705,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19936,10 +18733,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -19972,10 +18765,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20032,8 +18821,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20052,8 +18839,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20214,9 +18999,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20245,10 +19027,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20281,10 +19059,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20341,8 +19115,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20361,8 +19133,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20523,9 +19293,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20554,10 +19321,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20590,10 +19353,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20650,8 +19409,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20670,8 +19427,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20832,9 +19587,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20863,10 +19615,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20899,10 +19647,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -20959,8 +19703,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -20979,8 +19721,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21141,9 +19881,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21172,10 +19909,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21208,10 +19941,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21268,8 +19997,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21288,8 +20015,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21450,9 +20175,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21481,10 +20203,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21517,10 +20235,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21577,8 +20291,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21597,8 +20309,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21759,9 +20469,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21790,10 +20497,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21826,10 +20529,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -21886,8 +20585,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -21906,8 +20603,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22068,9 +20763,6 @@ entry:
 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22099,10 +20791,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-WGP-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-WGP-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-WGP-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22135,10 +20823,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s10, s10, s15
-; GFX10-CU-NEXT:    s_addc_u32 s11, s11, 0
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s10
-; GFX10-CU-NEXT:    s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s11
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_load_dword s9, s[6:7], 0x8
 ; GFX10-CU-NEXT:    s_load_dword s8, s[6:7], 0xc
@@ -22195,8 +20879,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
@@ -22215,8 +20897,6 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-TGSPLIT-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[8:9], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s7, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index b20b4ab47a59bb..b56860991b1948 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -40,9 +40,6 @@ define amdgpu_kernel void @global_agent_unordered_load(
 ;
 ; GFX7-LABEL: global_agent_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -239,9 +236,6 @@ define amdgpu_kernel void @global_agent_monotonic_load(
 ;
 ; GFX7-LABEL: global_agent_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -439,9 +433,6 @@ define amdgpu_kernel void @global_agent_acquire_load(
 ;
 ; GFX7-LABEL: global_agent_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -659,9 +650,6 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -884,9 +872,6 @@ define amdgpu_kernel void @global_agent_unordered_store(
 ;
 ; GFX7-LABEL: global_agent_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1054,9 +1039,6 @@ define amdgpu_kernel void @global_agent_monotonic_store(
 ;
 ; GFX7-LABEL: global_agent_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1225,9 +1207,6 @@ define amdgpu_kernel void @global_agent_release_store(
 ;
 ; GFX7-LABEL: global_agent_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1422,9 +1401,6 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1618,9 +1594,6 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1781,9 +1754,6 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1970,9 +1940,6 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2160,9 +2127,6 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2377,9 +2341,6 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2594,9 +2555,6 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2804,9 +2762,6 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3040,9 +2995,6 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3276,9 +3228,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3505,9 +3454,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3760,9 +3706,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4016,9 +3959,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4299,9 +4239,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4581,9 +4518,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4837,9 +4771,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5094,9 +5025,6 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5377,9 +5305,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5660,9 +5585,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5943,9 +5865,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6226,9 +6145,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6509,9 +6425,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6792,9 +6705,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7075,9 +6985,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -7358,9 +7265,6 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7617,9 +7521,6 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7896,9 +7797,6 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8182,9 +8080,6 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8488,9 +8383,6 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8793,9 +8685,6 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9072,9 +8961,6 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9352,9 +9238,6 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9658,9 +9541,6 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9964,9 +9844,6 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10270,9 +10147,6 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10576,9 +10450,6 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10882,9 +10753,6 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11188,9 +11056,6 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11494,9 +11359,6 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11798,9 +11660,6 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11997,9 +11856,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12197,9 +12053,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12417,9 +12270,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -12642,9 +12492,6 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12812,9 +12659,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12983,9 +12827,6 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13180,9 +13021,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13376,9 +13214,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13539,9 +13374,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13728,9 +13560,6 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13918,9 +13747,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14135,9 +13961,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -14352,9 +14175,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14562,9 +14382,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14798,9 +14615,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -15034,9 +14848,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15263,9 +15074,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15518,9 +15326,6 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15774,9 +15579,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16057,9 +15859,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16339,9 +16138,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16595,9 +16391,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16852,9 +16645,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17135,9 +16925,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17418,9 +17205,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17701,9 +17485,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17984,9 +17765,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18267,9 +18045,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18550,9 +18325,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -18833,9 +18605,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -19116,9 +18885,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19375,9 +19141,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19655,9 +19418,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19961,9 +19721,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20266,9 +20023,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20545,9 +20299,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20825,9 +20576,6 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21131,9 +20879,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21437,9 +21182,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21743,9 +21485,6 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22049,9 +21788,6 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22355,9 +22091,6 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22661,9 +22394,6 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22967,9 +22697,6 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index b97715e0ed8c95..9b2b3a4cfa9bae 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -35,9 +35,6 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ;
 ; GFX7-LABEL: global_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -243,9 +240,6 @@ define amdgpu_kernel void @global_nontemporal_load_1(
 ;
 ; GFX7-LABEL: global_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -504,9 +498,6 @@ define amdgpu_kernel void @global_nontemporal_store_0(
 ;
 ; GFX7-LABEL: global_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -707,9 +698,6 @@ define amdgpu_kernel void @global_nontemporal_store_1(
 ;
 ; GFX7-LABEL: global_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -957,9 +945,6 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: global_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index 1d7ba89e339597..afc46fbc23a67a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -40,9 +40,6 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
 ;
 ; GFX7-LABEL: global_singlethread_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -239,9 +236,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -438,9 +432,6 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -637,9 +628,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -829,9 +817,6 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
 ;
 ; GFX7-LABEL: global_singlethread_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -999,9 +984,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1169,9 +1151,6 @@ define amdgpu_kernel void @global_singlethread_release_store(
 ;
 ; GFX7-LABEL: global_singlethread_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1339,9 +1318,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1509,9 +1485,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1670,9 +1643,6 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1831,9 +1801,6 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1992,9 +1959,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2153,9 +2117,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2316,9 +2277,6 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2505,9 +2463,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2694,9 +2649,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2885,9 +2837,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3112,9 +3061,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3339,9 +3285,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3566,9 +3509,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3793,9 +3733,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4020,9 +3957,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4247,9 +4181,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4474,9 +4405,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4701,9 +4629,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4928,9 +4853,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5155,9 +5077,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5382,9 +5301,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5609,9 +5525,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5836,9 +5749,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6063,9 +5973,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6293,9 +6200,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6551,9 +6455,6 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6809,9 +6710,6 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7067,9 +6965,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7325,9 +7220,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7583,9 +7475,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7841,9 +7730,6 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8099,9 +7985,6 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8357,9 +8240,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8615,9 +8495,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8873,9 +8750,6 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9131,9 +9005,6 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9389,9 +9260,6 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9647,9 +9515,6 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9905,9 +9770,6 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10163,9 +10025,6 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10362,9 +10221,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10561,9 +10417,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10760,9 +10613,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10952,9 +10802,6 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11122,9 +10969,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11292,9 +11136,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11462,9 +11303,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11632,9 +11470,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11793,9 +11628,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11954,9 +11786,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12115,9 +11944,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12276,9 +12102,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12439,9 +12262,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12628,9 +12448,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12817,9 +12634,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13008,9 +12822,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13235,9 +13046,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13462,9 +13270,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13689,9 +13494,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13916,9 +13718,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14143,9 +13942,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14370,9 +14166,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14597,9 +14390,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14824,9 +14614,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15051,9 +14838,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15278,9 +15062,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15505,9 +15286,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15732,9 +15510,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15959,9 +15734,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16186,9 +15958,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16416,9 +16185,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16674,9 +16440,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16932,9 +16695,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17190,9 +16950,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17448,9 +17205,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17706,9 +17460,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17964,9 +17715,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18222,9 +17970,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18480,9 +18225,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18738,9 +18480,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18996,9 +18735,6 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
 ;
 ; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19254,9 +18990,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19512,9 +19245,6 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19770,9 +19500,6 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20028,9 +19755,6 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
 ;
 ; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index c1711040bf4014..62a4f3b43b2dcd 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -40,9 +40,6 @@ define amdgpu_kernel void @global_system_unordered_load(
 ;
 ; GFX7-LABEL: global_system_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -239,9 +236,6 @@ define amdgpu_kernel void @global_system_monotonic_load(
 ;
 ; GFX7-LABEL: global_system_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -439,9 +433,6 @@ define amdgpu_kernel void @global_system_acquire_load(
 ;
 ; GFX7-LABEL: global_system_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -661,9 +652,6 @@ define amdgpu_kernel void @global_system_seq_cst_load(
 ;
 ; GFX7-LABEL: global_system_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -888,9 +876,6 @@ define amdgpu_kernel void @global_system_unordered_store(
 ;
 ; GFX7-LABEL: global_system_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1058,9 +1043,6 @@ define amdgpu_kernel void @global_system_monotonic_store(
 ;
 ; GFX7-LABEL: global_system_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1229,9 +1211,6 @@ define amdgpu_kernel void @global_system_release_store(
 ;
 ; GFX7-LABEL: global_system_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1428,9 +1407,6 @@ define amdgpu_kernel void @global_system_seq_cst_store(
 ;
 ; GFX7-LABEL: global_system_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1626,9 +1602,6 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1789,9 +1762,6 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1980,9 +1950,6 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2172,9 +2139,6 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2393,9 +2357,6 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2614,9 +2575,6 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2826,9 +2784,6 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3066,9 +3021,6 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3306,9 +3258,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3535,9 +3484,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3792,9 +3738,6 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4050,9 +3993,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4337,9 +4277,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4623,9 +4560,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4881,9 +4815,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5140,9 +5071,6 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5427,9 +5355,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5714,9 +5639,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6001,9 +5923,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6288,9 +6207,6 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6547,9 +6463,6 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6829,9 +6742,6 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7139,9 +7049,6 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7448,9 +7355,6 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7729,9 +7633,6 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8011,9 +7912,6 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8321,9 +8219,6 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8631,9 +8526,6 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8941,9 +8833,6 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9251,9 +9140,6 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9561,9 +9447,6 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9871,9 +9754,6 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10181,9 +10061,6 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10489,9 +10366,6 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_system_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10688,9 +10562,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10888,9 +10759,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11110,9 +10978,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11337,9 +11202,6 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_system_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11507,9 +11369,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11678,9 +11537,6 @@ define amdgpu_kernel void @global_system_one_as_release_store(
 ;
 ; GFX7-LABEL: global_system_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11877,9 +11733,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12075,9 +11928,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12238,9 +12088,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12429,9 +12276,6 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12621,9 +12465,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12842,9 +12683,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13063,9 +12901,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13275,9 +13110,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13515,9 +13347,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13755,9 +13584,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13984,9 +13810,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14241,9 +14064,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14499,9 +14319,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14786,9 +14603,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15072,9 +14886,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15330,9 +15141,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15589,9 +15397,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15876,9 +15681,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16163,9 +15965,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16450,9 +16249,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16737,9 +16533,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17024,9 +16817,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17311,9 +17101,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17598,9 +17385,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17885,9 +17669,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18144,9 +17925,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18425,9 +18203,6 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18713,9 +18488,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19023,9 +18795,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19332,9 +19101,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19613,9 +19379,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19895,9 +19658,6 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20205,9 +19965,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20515,9 +20272,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20825,9 +20579,6 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21135,9 +20886,6 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21445,9 +21193,6 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21755,9 +21500,6 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -22065,9 +21807,6 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 5581d76f539746..a98efb49b4b72b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -36,9 +36,6 @@ define amdgpu_kernel void @global_volatile_load_0(
 ;
 ; GFX7-LABEL: global_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -193,9 +190,6 @@ define amdgpu_kernel void @global_volatile_load_1(
 ;
 ; GFX7-LABEL: global_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x2
 ; GFX7-NEXT:    s_mov_b32 s6, 2
@@ -387,9 +381,6 @@ define amdgpu_kernel void @global_volatile_store_0(
 ;
 ; GFX7-LABEL: global_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -553,9 +544,6 @@ define amdgpu_kernel void @global_volatile_store_1(
 ;
 ; GFX7-LABEL: global_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -753,9 +741,6 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
 ;
 ; GFX7-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -899,9 +884,6 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
 ;
 ; GFX7-LABEL: global_volatile_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index 3c1e0e1d3d9afc..f805e2cf37006c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -40,9 +40,6 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
 ;
 ; GFX7-LABEL: global_wavefront_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -239,9 +236,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -438,9 +432,6 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -637,9 +628,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -829,9 +817,6 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
 ;
 ; GFX7-LABEL: global_wavefront_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -999,9 +984,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1169,9 +1151,6 @@ define amdgpu_kernel void @global_wavefront_release_store(
 ;
 ; GFX7-LABEL: global_wavefront_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1339,9 +1318,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1509,9 +1485,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1670,9 +1643,6 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1831,9 +1801,6 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1992,9 +1959,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2153,9 +2117,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2316,9 +2277,6 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2505,9 +2463,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2694,9 +2649,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2885,9 +2837,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3112,9 +3061,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3339,9 +3285,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3566,9 +3509,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3793,9 +3733,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4020,9 +3957,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4247,9 +4181,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4474,9 +4405,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4701,9 +4629,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4928,9 +4853,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5155,9 +5077,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5382,9 +5301,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5609,9 +5525,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5836,9 +5749,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6063,9 +5973,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6293,9 +6200,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6551,9 +6455,6 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -6809,9 +6710,6 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7067,9 +6965,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7325,9 +7220,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7583,9 +7475,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7841,9 +7730,6 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8099,9 +7985,6 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8357,9 +8240,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8615,9 +8495,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8873,9 +8750,6 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9131,9 +9005,6 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9389,9 +9260,6 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9647,9 +9515,6 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9905,9 +9770,6 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10163,9 +10025,6 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10362,9 +10221,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10561,9 +10417,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10760,9 +10613,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -10952,9 +10802,6 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11122,9 +10969,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11292,9 +11136,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11462,9 +11303,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11632,9 +11470,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11793,9 +11628,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -11954,9 +11786,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12115,9 +11944,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12276,9 +12102,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12439,9 +12262,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12628,9 +12448,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12817,9 +12634,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13008,9 +12822,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13235,9 +13046,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13462,9 +13270,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13689,9 +13494,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -13916,9 +13718,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14143,9 +13942,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14370,9 +14166,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14597,9 +14390,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14824,9 +14614,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15051,9 +14838,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15278,9 +15062,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15505,9 +15286,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15732,9 +15510,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15959,9 +15734,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16186,9 +15958,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16416,9 +16185,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16674,9 +16440,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -16932,9 +16695,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17190,9 +16950,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17448,9 +17205,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17706,9 +17460,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17964,9 +17715,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18222,9 +17970,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18480,9 +18225,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18738,9 +18480,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18996,9 +18735,6 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19254,9 +18990,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19512,9 +19245,6 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19770,9 +19500,6 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20028,9 +19755,6 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 8d4b24de61deb4..30bf4920715352 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -40,9 +40,6 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
 ;
 ; GFX7-LABEL: global_workgroup_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -239,9 +236,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -438,9 +432,6 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -645,9 +636,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -853,9 +841,6 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
 ;
 ; GFX7-LABEL: global_workgroup_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1023,9 +1008,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1194,9 +1176,6 @@ define amdgpu_kernel void @global_workgroup_release_store(
 ;
 ; GFX7-LABEL: global_workgroup_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1383,9 +1362,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1571,9 +1547,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1732,9 +1705,6 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -1904,9 +1874,6 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2084,9 +2051,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2274,9 +2238,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -2465,9 +2426,6 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2662,9 +2620,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2877,9 +2832,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3093,9 +3045,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3320,9 +3269,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3558,9 +3504,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -3804,9 +3747,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4060,9 +4000,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4315,9 +4252,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4552,9 +4486,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -4790,9 +4721,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5046,9 +4974,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5302,9 +5227,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5558,9 +5480,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -5814,9 +5733,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6070,9 +5986,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6326,9 +6239,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6582,9 +6492,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -6840,9 +6747,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7098,9 +7002,6 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7364,9 +7265,6 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7641,9 +7539,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -7925,9 +7820,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8208,9 +8100,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8473,9 +8362,6 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -8739,9 +8625,6 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9023,9 +8906,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9307,9 +9187,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9591,9 +9468,6 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -9875,9 +9749,6 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10159,9 +10030,6 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10443,9 +10311,6 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -10727,9 +10592,6 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -11010,9 +10872,6 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11209,9 +11068,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11408,9 +11264,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11614,9 +11467,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -11819,9 +11669,6 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -11989,9 +11836,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12159,9 +12003,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12340,9 +12181,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dword s4, s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -12521,9 +12359,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12682,9 +12517,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -12853,9 +12685,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13025,9 +12854,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13207,9 +13033,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x2
@@ -13391,9 +13214,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13587,9 +13407,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -13794,9 +13611,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s6, s[6:7], 0x2
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
@@ -14003,9 +13817,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14230,9 +14041,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14467,9 +14275,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14705,9 +14510,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -14953,9 +14755,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15201,9 +15000,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15438,9 +15234,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15675,9 +15468,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -15923,9 +15713,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16171,9 +15958,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16419,9 +16203,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16667,9 +16448,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -16915,9 +16693,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17163,9 +16938,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17411,9 +17183,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dword s7, s[4:5], 0x2
@@ -17662,9 +17431,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -17920,9 +17686,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18185,9 +17948,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18454,9 +18214,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -18730,9 +18487,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19006,9 +18760,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19271,9 +19022,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19536,9 +19284,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -19812,9 +19557,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20088,9 +19830,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20364,9 +20103,6 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
 ;
 ; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20640,9 +20376,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -20916,9 +20649,6 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21192,9 +20922,6 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
@@ -21468,9 +21195,6 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX7-NEXT:    s_load_dword s9, s[6:7], 0x2
 ; GFX7-NEXT:    s_load_dword s8, s[6:7], 0x3
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index 403e3012baa222..ba9711333a1946 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -37,9 +37,6 @@ define amdgpu_kernel void @local_nontemporal_load_0(
 ;
 ; GFX7-LABEL: local_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -241,9 +238,6 @@ define amdgpu_kernel void @local_nontemporal_load_1(
 ;
 ; GFX7-LABEL: local_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -887,9 +881,6 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: local_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 454b11e3e67251..a52dd9b3401696 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -33,9 +33,6 @@ define amdgpu_kernel void @local_volatile_load_0(
 ;
 ; GFX7-LABEL: local_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -181,9 +178,6 @@ define amdgpu_kernel void @local_volatile_load_1(
 ;
 ; GFX7-LABEL: local_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 9166046b55fca3..4e08065e879fd8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -37,10 +37,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX7-LABEL: private_nontemporal_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -56,7 +53,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -72,7 +69,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -113,7 +110,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -129,7 +126,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -249,10 +246,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX7-LABEL: private_nontemporal_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -270,7 +264,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
@@ -288,7 +282,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
@@ -333,7 +327,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
@@ -354,7 +348,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    v_mov_b32_e32 v1, v0
@@ -505,7 +499,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX7-LABEL: private_nontemporal_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -520,7 +514,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -536,7 +530,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -571,7 +565,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -587,7 +581,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -701,7 +695,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX7-LABEL: private_nontemporal_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -718,7 +712,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -734,7 +728,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
@@ -771,7 +765,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_nop 0
@@ -790,7 +784,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX90A-TGSPLIT-NEXT:    s_nop 0
@@ -935,10 +929,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX7-LABEL: private_nontemporal_volatile_load:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -954,7 +945,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX10-WGP-LABEL: private_nontemporal_volatile_load:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -970,7 +961,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX10-CU-LABEL: private_nontemporal_volatile_load:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1011,7 +1002,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX90A-NOTTGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-NOTTGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-NOTTGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-NOTTGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -1027,7 +1018,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load:
 ; GFX90A-TGSPLIT:       ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT:    s_add_u32 s0, s0, s13
 ; GFX90A-TGSPLIT-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX90A-TGSPLIT-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX90A-TGSPLIT-NEXT:    s_load_dword s6, s[4:5], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index ddee2f39c4ba16..a68b5f36b806ed 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -37,10 +37,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX7-LABEL: private_volatile_load_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -56,7 +53,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX10-WGP-LABEL: private_volatile_load_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -72,7 +69,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX10-CU-LABEL: private_volatile_load_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -199,10 +196,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX7-LABEL: private_volatile_load_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX7-NEXT:    s_add_i32 s10, s10, s15
-; GFX7-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dword s6, s[4:5], 0x0
@@ -220,7 +214,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX10-WGP-LABEL: private_volatile_load_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    v_mov_b32_e32 v1, v0
@@ -238,7 +232,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX10-CU-LABEL: private_volatile_load_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    v_mov_b32_e32 v1, v0
@@ -384,7 +378,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX7-LABEL: private_volatile_store_0:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -400,7 +394,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX10-WGP-LABEL: private_volatile_store_0:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -417,7 +411,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX10-CU-LABEL: private_volatile_store_0:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -545,7 +539,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX7-LABEL: private_volatile_store_1:
 ; GFX7:       ; %bb.0: ; %entry
-; GFX7-NEXT:    s_add_u32 s0, s0, s15
+; GFX7-NEXT:    s_add_u32 s0, s0, s13
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX7-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
@@ -563,7 +557,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX10-WGP-LABEL: private_volatile_store_1:
 ; GFX10-WGP:       ; %bb.0: ; %entry
-; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-WGP-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-WGP-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-WGP-NEXT:    s_nop 0
@@ -580,7 +574,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX10-CU-LABEL: private_volatile_store_1:
 ; GFX10-CU:       ; %bb.0: ; %entry
-; GFX10-CU-NEXT:    s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT:    s_add_u32 s0, s0, s13
 ; GFX10-CU-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-CU-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
 ; GFX10-CU-NEXT:    s_nop 0
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index bbb70d53b0dd4b..686797f290b97f 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -34,13 +34,10 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -59,13 +56,10 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -152,9 +146,6 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_imin_sle_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -166,9 +157,6 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_imin_sle_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -230,9 +218,6 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ; CI-LABEL: s_test_imin_sle_v1i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -244,9 +229,6 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ; VI-LABEL: s_test_imin_sle_v1i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -310,9 +292,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
 ;
 ; CI-LABEL: s_test_imin_sle_v4i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -331,9 +310,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
 ;
 ; VI-LABEL: s_test_imin_sle_v4i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -446,14 +422,11 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i8 s2, s2
 ; CI-NEXT:    s_sext_i32_i8 s3, s3
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_byte v[0:1], v2
@@ -464,14 +437,11 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i8 s2, s2
 ; VI-NEXT:    s_sext_i32_i8 s3, s3
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_byte v[0:1], v2
@@ -589,8 +559,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_ashr_i32 s4, s2, 24
 ; CI-NEXT:    s_sext_i32_i8 s5, s2
@@ -614,7 +582,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; CI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; CI-NEXT:    s_or_b32 s2, s3, s2
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -625,8 +592,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_lshrrev_b16_e64 v0, 8, s2
 ; VI-NEXT:    v_lshrrev_b16_e64 v1, 8, s3
@@ -647,7 +612,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -789,9 +753,6 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; CI-LABEL: s_test_imin_sle_v2i16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_ashr_i32 s4, s2, 16
 ; CI-NEXT:    s_sext_i32_i16 s2, s2
@@ -811,9 +772,6 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ; VI-LABEL: s_test_imin_sle_v2i16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_ashr_i32 s4, s2, 16
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
@@ -948,9 +906,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_ashr_i32 s6, s0, 16
 ; CI-NEXT:    s_ashr_i32 s7, s1, 16
@@ -981,9 +936,6 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_ashr_i32 s6, s1, 16
 ; VI-NEXT:    s_sext_i32_i16 s1, s1
@@ -1082,13 +1034,10 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1107,13 +1056,10 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1229,13 +1175,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1254,13 +1197,10 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 1, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1348,9 +1288,6 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_imin_slt_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1362,9 +1299,6 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_imin_slt_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1428,9 +1362,6 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s1, s1, s3
 ; CI-NEXT:    s_min_i32 s0, s0, s2
@@ -1445,9 +1376,6 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s1, s1, s3
 ; VI-NEXT:    s_min_i32 s0, s0, s2
@@ -1522,9 +1450,6 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1537,9 +1462,6 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1607,9 +1529,6 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_i32 s2, s2, 8
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1622,9 +1541,6 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_i32 s2, s2, 8
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1704,13 +1620,10 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1729,13 +1642,10 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1841,15 +1751,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v6
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v2, s5
 ; CI-NEXT:    v_add_i32_e32 v3, vcc, s4, v6
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
 ; CI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
 ; CI-NEXT:    flat_load_dwordx3 v[3:5], v[3:4]
@@ -1868,15 +1775,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v6, 4, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v6
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v2, s5
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, s4, v6
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v2, vcc
 ; VI-NEXT:    flat_load_dwordx3 v[0:2], v[0:1]
 ; VI-NEXT:    flat_load_dwordx3 v[3:5], v[3:4]
@@ -2010,15 +1914,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; CI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; CI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
@@ -2049,15 +1950,12 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
 ; VI-NEXT:    flat_load_dwordx2 v[2:3], v[2:3]
@@ -2156,9 +2054,6 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_umin_ule_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2170,9 +2065,6 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_umin_ule_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2247,13 +2139,10 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v4
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v4
 ; CI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2272,13 +2161,10 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2384,9 +2270,6 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v2, s3
 ; CI-NEXT:    v_add_i32_e32 v1, vcc, s2, v0
@@ -2408,9 +2291,6 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v2, s3
 ; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v0
@@ -2498,9 +2378,6 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ; CI-LABEL: s_test_umin_ult_i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2512,9 +2389,6 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ; VI-LABEL: s_test_umin_ult_i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2598,9 +2472,6 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; CI-LABEL: v_test_umin_ult_i32_multi_use:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s5, s[6:7], 0x0
@@ -2622,9 +2493,6 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ; VI-LABEL: v_test_umin_ult_i32_multi_use:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s5, s[6:7], 0x0
@@ -2754,9 +2622,6 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; CI-LABEL: v_test_umin_ult_i16_multi_use:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s4
 ; CI-NEXT:    v_mov_b32_e32 v1, s5
@@ -2779,9 +2644,6 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ; VI-LABEL: v_test_umin_ult_i16_multi_use:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
@@ -2874,9 +2736,6 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ; CI-LABEL: s_test_umin_ult_v1i32:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2888,9 +2747,6 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ; VI-LABEL: s_test_umin_ult_v1i32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -2962,9 +2818,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
 ;
 ; CI-LABEL: s_test_umin_ult_v8i32:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x8
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2996,9 +2849,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
 ;
 ; VI-LABEL: s_test_umin_ult_v8i32:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx16 s[8:23], s[6:7], 0x20
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3259,9 +3109,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ;
 ; CI-LABEL: s_test_umin_ult_v8i16:
 ; CI:       ; %bb.0:
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3308,9 +3155,6 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ;
 ; VI-LABEL: s_test_umin_ult_v8i16:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3436,14 +3280,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; CI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; CI-NEXT:    s_min_u32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -3454,14 +3295,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0xffff
 ; VI-NEXT:    s_and_b32 s3, s3, 0xffff
 ; VI-NEXT:    s_min_u32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -3551,14 +3389,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0xa
 ; CI-NEXT:    s_load_dword s3, s[6:7], 0x13
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i16 s2, s2
 ; CI-NEXT:    s_sext_i32_i16 s3, s3
 ; CI-NEXT:    s_min_i32 s2, s2, s3
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s2
 ; CI-NEXT:    flat_store_dword v[0:1], v2
@@ -3569,14 +3404,11 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x28
 ; VI-NEXT:    s_load_dword s3, s[6:7], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
 ; VI-NEXT:    s_sext_i32_i16 s3, s3
 ; VI-NEXT:    s_min_i32 s2, s2, s3
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s2
 ; VI-NEXT:    flat_store_dword v[0:1], v2
@@ -3673,9 +3505,6 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i16 s3, s2
 ; CI-NEXT:    s_ashr_i32 s2, s2, 16
@@ -3690,9 +3519,6 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_sext_i32_i16 s3, s2
 ; VI-NEXT:    s_ashr_i32 s2, s2, 16
@@ -3775,9 +3601,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3796,9 +3619,6 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3891,9 +3711,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -3912,9 +3729,6 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4007,9 +3821,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4028,9 +3839,6 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4123,9 +3931,6 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4144,9 +3949,6 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s4
@@ -4263,12 +4065,9 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
@@ -4297,13 +4096,10 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -4421,12 +4217,9 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x4
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; CI-NEXT:    v_mov_b32_e32 v3, s5
 ; CI-NEXT:    flat_load_dword v4, v[0:1]
@@ -4454,13 +4247,10 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x10
 ; VI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v4
 ; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v4
 ; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index d7814c52828b88..5792fab7011afe 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -180,9 +180,6 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -263,9 +260,6 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -347,9 +341,6 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -412,9 +403,6 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -477,9 +465,6 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -542,9 +527,6 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0)
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -606,9 +588,6 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX8-NEXT:    s_add_i32 s10, s10, s15
-; GFX8-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX8-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index 3de6945f95556b..529e64715500dd 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -176,9 +176,6 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1)
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -257,9 +254,6 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v2
@@ -339,9 +333,6 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 {
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -402,9 +393,6 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -466,9 +454,6 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 {
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
@@ -529,9 +514,6 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0
 ; GFX803:       ; %bb.0:
 ; GFX803-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GFX803-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX803-NEXT:    s_add_i32 s10, s10, s15
-; GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX803-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index 176f04fb03506a..58b61510c24e8b 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -10,36 +10,36 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; REGALLOC-GFX908: bb.0 (%ir-block.0):
   ; REGALLOC-GFX908-NEXT:   liveins: $sgpr4_sgpr5
   ; REGALLOC-GFX908-NEXT: {{  $}}
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %27
-  ; REGALLOC-GFX908-NEXT:   [[COPY:%[0-9]+]]:av_128 = COPY %27
-  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %24
-  ; REGALLOC-GFX908-NEXT:   SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def %26
+  ; REGALLOC-GFX908-NEXT:   [[COPY:%[0-9]+]]:av_128 = COPY %26
+  ; REGALLOC-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %23
+  ; REGALLOC-GFX908-NEXT:   SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
   ; REGALLOC-GFX908-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[COPY]]
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %15:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %14:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; REGALLOC-GFX908-NEXT:   [[COPY2:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; REGALLOC-GFX908-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX908-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; REGALLOC-GFX908-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY2]], 0, 0, 0, implicit $mode, implicit $exec
   ; REGALLOC-GFX908-NEXT:   [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef %17:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef %16:vreg_64, [[SI_SPILL_V64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
-  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX908-NEXT:   S_ENDPGM 0
   ;
   ; PEI-GFX908-LABEL: name: partial_copy
   ; PEI-GFX908: bb.0 (%ir-block.0):
-  ; PEI-GFX908-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
+  ; PEI-GFX908-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
   ; PEI-GFX908-NEXT: {{  $}}
-  ; PEI-GFX908-NEXT:   $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
-  ; PEI-GFX908-NEXT:   $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
-  ; PEI-GFX908-NEXT:   $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+  ; PEI-GFX908-NEXT:   $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+  ; PEI-GFX908-NEXT:   $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+  ; PEI-GFX908-NEXT:   $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
   ; PEI-GFX908-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
   ; PEI-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
   ; PEI-GFX908-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
   ; PEI-GFX908-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
-  ; PEI-GFX908-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+  ; PEI-GFX908-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
   ; PEI-GFX908-NEXT:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
   ; PEI-GFX908-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
   ; PEI-GFX908-NEXT:   GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
@@ -48,7 +48,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; PEI-GFX908-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
   ; PEI-GFX908-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
   ; PEI-GFX908-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-  ; PEI-GFX908-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+  ; PEI-GFX908-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
   ; PEI-GFX908-NEXT:   $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
   ; PEI-GFX908-NEXT:   GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX908-NEXT:   renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -59,34 +59,34 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; REGALLOC-GFX90A: bb.0 (%ir-block.0):
   ; REGALLOC-GFX90A-NEXT:   liveins: $sgpr4_sgpr5
   ; REGALLOC-GFX90A-NEXT: {{  $}}
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %26
-  ; REGALLOC-GFX90A-NEXT:   [[COPY:%[0-9]+]]:av_128_align2 = COPY %26
-  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %24
-  ; REGALLOC-GFX90A-NEXT:   SI_SPILL_V64_SAVE %24, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def %25
+  ; REGALLOC-GFX90A-NEXT:   [[COPY:%[0-9]+]]:av_128_align2 = COPY %25
+  ; REGALLOC-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %23
+  ; REGALLOC-GFX90A-NEXT:   SI_SPILL_V64_SAVE %23, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, [[COPY]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX90A-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
   ; REGALLOC-GFX90A-NEXT:   [[COPY1:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
   ; REGALLOC-GFX90A-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; REGALLOC-GFX90A-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; REGALLOC-GFX90A-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec
   ; REGALLOC-GFX90A-NEXT:   [[SI_SPILL_AV64_RESTORE:%[0-9]+]]:av_64_align2 = SI_SPILL_AV64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
-  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, [[SI_SPILL_AV64_RESTORE]], 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+  ; REGALLOC-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; REGALLOC-GFX90A-NEXT:   S_ENDPGM 0
   ;
   ; PEI-GFX90A-LABEL: name: partial_copy
   ; PEI-GFX90A: bb.0 (%ir-block.0):
-  ; PEI-GFX90A-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
+  ; PEI-GFX90A-NEXT:   liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
   ; PEI-GFX90A-NEXT: {{  $}}
-  ; PEI-GFX90A-NEXT:   $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
-  ; PEI-GFX90A-NEXT:   $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
-  ; PEI-GFX90A-NEXT:   $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+  ; PEI-GFX90A-NEXT:   $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+  ; PEI-GFX90A-NEXT:   $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+  ; PEI-GFX90A-NEXT:   $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
   ; PEI-GFX90A-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
   ; PEI-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6553610 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
   ; PEI-GFX90A-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
   ; PEI-GFX90A-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
-  ; PEI-GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+  ; PEI-GFX90A-NEXT:   BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
   ; PEI-GFX90A-NEXT:   $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
   ; PEI-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX90A-NEXT:   renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
@@ -94,7 +94,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
   ; PEI-GFX90A-NEXT:   renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
   ; PEI-GFX90A-NEXT:   renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
   ; PEI-GFX90A-NEXT:   renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
-  ; PEI-GFX90A-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+  ; PEI-GFX90A-NEXT:   $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
   ; PEI-GFX90A-NEXT:   $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
   ; PEI-GFX90A-NEXT:   GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
   ; PEI-GFX90A-NEXT:   GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 847c2d343d4153..c2132cf907fdb2 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -6,9 +6,6 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b,
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s2
@@ -36,12 +33,9 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x5a
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_sad_u32 v2, s2, v0, 20
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
@@ -63,9 +57,6 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v1, s2
@@ -92,9 +83,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s16, s16, s15
+; GCN-NEXT:    s_add_u32 s16, s16, s13
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s3, s0, s1
@@ -103,7 +92,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_add_i32 s0, s0, s2
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -131,15 +119,12 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s16, s16, s15
+; GCN-NEXT:    s_add_u32 s16, s16, s13
+; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v2, s1
 ; GCN-NEXT:    v_mov_b32_e32 v3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    v_sad_u32 v2, s0, v2, v3
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
@@ -166,9 +151,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s16, s16, s15
+; GCN-NEXT:    s_add_u32 s16, s16, s13
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_max_u32 s3, s0, s1
@@ -179,7 +162,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v3
 ; GCN-NEXT:    s_endpgm
@@ -204,9 +186,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s16, s16, s15
+; GCN-NEXT:    s_add_u32 s16, s16, s13
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s3, s0, s1
@@ -217,7 +197,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v3
 ; GCN-NEXT:    s_endpgm
@@ -243,9 +222,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s16, s16, s15
+; GCN-NEXT:    s_add_u32 s16, s16, s13
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s3, s0, s1
@@ -256,7 +233,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    flat_store_dword v[0:1], v3
 ; GCN-NEXT:    s_endpgm
@@ -279,9 +255,7 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 ; GCN-NEXT:    s_mov_b64 s[16:17], s[0:1]
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_add_u32 s16, s16, s15
+; GCN-NEXT:    s_add_u32 s16, s16, s13
 ; GCN-NEXT:    s_addc_u32 s17, s17, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_min_u32 s3, s0, s1
@@ -290,7 +264,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    s_add_i32 s0, s0, s2
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    buffer_store_dword v2, v0, s[16:19], 0 offen
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
@@ -312,9 +285,6 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
 define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; GCN-LABEL: v_sad_u32_vector_pat1:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0xc
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
@@ -351,9 +321,6 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32
 define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
 ; GCN-LABEL: v_sad_u32_vector_pat2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x4
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0xc
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
@@ -391,8 +358,6 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
 ; GCN-NEXT:    s_load_dword s4, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[2:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 16
@@ -400,7 +365,6 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    v_sad_u32 v2, s4, v1, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-NEXT:    flat_store_short v[0:1], v2
 ; GCN-NEXT:    s_endpgm
@@ -420,9 +384,6 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
 define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
 ; GCN-LABEL: v_sad_u32_i16_pat2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    flat_load_ushort v0, v[0:1] glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -455,9 +416,6 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s3, s2, 0xff
 ; GCN-NEXT:    s_bfe_u32 s4, s2, 0x80008
@@ -485,9 +443,6 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
 define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
 ; GCN-LABEL: v_sad_u32_i8_pat2:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    flat_load_ubyte v0, v[0:1] glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
@@ -520,9 +475,6 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_and_b32 s3, s2, 0xff
 ; GCN-NEXT:    s_bfe_u32 s4, s2, 0x80008
@@ -550,9 +502,6 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_max_u32 s6, s0, s1
 ; GCN-NEXT:    s_cmp_le_u32 s0, s1
@@ -582,9 +531,6 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x2
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s3, s0, s3
 ; GCN-NEXT:    s_sub_i32 s6, s1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index 9826585df8bd80..89a09dc4fcc171 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -9,8 +9,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -26,8 +24,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX906-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX906-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX906-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -43,8 +39,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX908:       ; %bb.0: ; %entry
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX908-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX908-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -61,8 +55,6 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
-; GFX90A-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -96,8 +88,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX900:       ; %bb.0: ; %entry
 ; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX900-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX900-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX900-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -113,8 +103,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX906:       ; %bb.0: ; %entry
 ; GFX906-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX906-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX906-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX906-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -130,8 +118,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX908:       ; %bb.0: ; %entry
 ; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
-; GFX908-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX908-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX908-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
@@ -148,8 +134,6 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
 ; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; GFX90A-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 4, v4
-; GFX90A-NEXT:    s_add_u32 flat_scratch_lo, s10, s15
-; GFX90A-NEXT:    s_addc_u32 flat_scratch_hi, s11, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, s2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index 597f2d875cac47..ad82869c001f6f 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -26,179 +26,175 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    s_load_dword s0, s[6:7], 0x8
 ; CHECK-NEXT:    v_writelane_b32 v23, s3, 1
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[48:51]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[4:11]
+; CHECK-NEXT:    ; def s[4:7]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_writelane_b32 v23, s4, 2
 ; CHECK-NEXT:    v_writelane_b32 v23, s5, 3
 ; CHECK-NEXT:    v_writelane_b32 v23, s6, 4
 ; CHECK-NEXT:    v_writelane_b32 v23, s7, 5
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 6
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 7
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 8
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 9
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[4:11]
+; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 6
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 7
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 8
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 9
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 10
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 11
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 12
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 13
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:19]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 10
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 11
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 12
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 13
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 14
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 15
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 16
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 17
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 18
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 19
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 20
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 21
-; CHECK-NEXT:    v_writelane_b32 v23, s16, 22
-; CHECK-NEXT:    v_writelane_b32 v23, s17, 23
-; CHECK-NEXT:    v_writelane_b32 v23, s18, 24
-; CHECK-NEXT:    v_writelane_b32 v23, s19, 25
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 14
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 15
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 16
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 17
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 18
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 19
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 20
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 21
+; CHECK-NEXT:    v_writelane_b32 v23, s12, 22
+; CHECK-NEXT:    v_writelane_b32 v23, s13, 23
+; CHECK-NEXT:    v_writelane_b32 v23, s14, 24
+; CHECK-NEXT:    v_writelane_b32 v23, s15, 25
+; CHECK-NEXT:    v_writelane_b32 v23, s16, 26
+; CHECK-NEXT:    v_writelane_b32 v23, s17, 27
+; CHECK-NEXT:    v_writelane_b32 v23, s18, 28
+; CHECK-NEXT:    v_writelane_b32 v23, s19, 29
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[38:39]
+; CHECK-NEXT:    ; def s[42:43]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[44:47]
+; CHECK-NEXT:    ; def s[52:55]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[4:11]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 26
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 27
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 28
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 29
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 30
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 31
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 32
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 33
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 30
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 31
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 32
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 33
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 34
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 35
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 36
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 37
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[16:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[36:37]
+; CHECK-NEXT:    ; def s[40:41]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[40:43]
+; CHECK-NEXT:    ; def s[36:39]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:7]
+; CHECK-NEXT:    ; def s[44:51]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 34
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 35
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 36
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 37
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 38
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 39
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 40
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 41
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 42
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 43
-; CHECK-NEXT:    v_writelane_b32 v23, s2, 44
-; CHECK-NEXT:    v_writelane_b32 v23, s3, 45
-; CHECK-NEXT:    v_writelane_b32 v23, s4, 46
-; CHECK-NEXT:    v_writelane_b32 v23, s5, 47
-; CHECK-NEXT:    v_writelane_b32 v23, s6, 48
-; CHECK-NEXT:    v_writelane_b32 v23, s7, 49
-; CHECK-NEXT:    v_writelane_b32 v23, s8, 50
-; CHECK-NEXT:    v_writelane_b32 v23, s9, 51
-; CHECK-NEXT:    v_writelane_b32 v23, s10, 52
-; CHECK-NEXT:    v_writelane_b32 v23, s11, 53
-; CHECK-NEXT:    v_writelane_b32 v23, s12, 54
-; CHECK-NEXT:    v_writelane_b32 v23, s13, 55
-; CHECK-NEXT:    v_writelane_b32 v23, s14, 56
-; CHECK-NEXT:    v_writelane_b32 v23, s15, 57
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 38
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 39
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 40
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 41
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 42
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 43
+; CHECK-NEXT:    v_writelane_b32 v23, s6, 44
+; CHECK-NEXT:    v_writelane_b32 v23, s7, 45
+; CHECK-NEXT:    v_writelane_b32 v23, s8, 46
+; CHECK-NEXT:    v_writelane_b32 v23, s9, 47
+; CHECK-NEXT:    v_writelane_b32 v23, s10, 48
+; CHECK-NEXT:    v_writelane_b32 v23, s11, 49
+; CHECK-NEXT:    v_writelane_b32 v23, s12, 50
+; CHECK-NEXT:    v_writelane_b32 v23, s13, 51
+; CHECK-NEXT:    v_writelane_b32 v23, s14, 52
+; CHECK-NEXT:    v_writelane_b32 v23, s15, 53
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[34:35]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v23, s0, 54
+; CHECK-NEXT:    v_writelane_b32 v23, s1, 55
+; CHECK-NEXT:    v_writelane_b32 v23, s2, 56
+; CHECK-NEXT:    v_writelane_b32 v23, s3, 57
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[0:7]
+; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_writelane_b32 v23, s0, 58
 ; CHECK-NEXT:    v_writelane_b32 v23, s1, 59
 ; CHECK-NEXT:    v_writelane_b32 v23, s2, 60
 ; CHECK-NEXT:    ; implicit-def: $vgpr0
 ; CHECK-NEXT:    v_writelane_b32 v23, s3, 61
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v23, s0, 62
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 0
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 1
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 2
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 3
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 4
-; CHECK-NEXT:    v_writelane_b32 v23, s1, 63
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 5
+; CHECK-NEXT:    v_writelane_b32 v23, s4, 62
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 0
+; CHECK-NEXT:    v_writelane_b32 v23, s5, 63
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 1
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 6
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 7
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 8
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 9
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 10
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 11
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 12
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 13
-; CHECK-NEXT:    v_writelane_b32 v0, s8, 14
-; CHECK-NEXT:    v_writelane_b32 v0, s9, 15
-; CHECK-NEXT:    v_writelane_b32 v0, s10, 16
-; CHECK-NEXT:    v_writelane_b32 v0, s11, 17
-; CHECK-NEXT:    v_writelane_b32 v0, s12, 18
-; CHECK-NEXT:    v_writelane_b32 v0, s13, 19
-; CHECK-NEXT:    v_writelane_b32 v0, s14, 20
-; CHECK-NEXT:    v_writelane_b32 v0, s15, 21
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 2
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 3
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 4
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 5
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 6
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 7
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 8
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 9
+; CHECK-NEXT:    v_writelane_b32 v0, s8, 10
+; CHECK-NEXT:    v_writelane_b32 v0, s9, 11
+; CHECK-NEXT:    v_writelane_b32 v0, s10, 12
+; CHECK-NEXT:    v_writelane_b32 v0, s11, 13
+; CHECK-NEXT:    v_writelane_b32 v0, s12, 14
+; CHECK-NEXT:    v_writelane_b32 v0, s13, 15
+; CHECK-NEXT:    v_writelane_b32 v0, s14, 16
+; CHECK-NEXT:    v_writelane_b32 v0, s15, 17
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 22
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 23
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 18
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 19
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 20
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 21
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 22
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 23
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; def s[0:7]
+; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_writelane_b32 v0, s0, 24
 ; CHECK-NEXT:    v_writelane_b32 v0, s1, 25
 ; CHECK-NEXT:    v_writelane_b32 v0, s2, 26
 ; CHECK-NEXT:    v_writelane_b32 v0, s3, 27
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; def s[0:7]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 28
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 29
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 30
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 31
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 32
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 33
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 34
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 35
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 28
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 29
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 30
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 31
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; def s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_writelane_b32 v0, s0, 36
-; CHECK-NEXT:    v_writelane_b32 v0, s1, 37
-; CHECK-NEXT:    v_writelane_b32 v0, s2, 38
-; CHECK-NEXT:    v_writelane_b32 v0, s3, 39
-; CHECK-NEXT:    v_writelane_b32 v0, s4, 40
-; CHECK-NEXT:    v_writelane_b32 v0, s5, 41
-; CHECK-NEXT:    v_writelane_b32 v0, s6, 42
-; CHECK-NEXT:    v_writelane_b32 v0, s7, 43
-; CHECK-NEXT:    v_writelane_b32 v0, s8, 44
-; CHECK-NEXT:    v_writelane_b32 v0, s9, 45
-; CHECK-NEXT:    v_writelane_b32 v0, s10, 46
-; CHECK-NEXT:    v_writelane_b32 v0, s11, 47
-; CHECK-NEXT:    v_writelane_b32 v0, s12, 48
-; CHECK-NEXT:    v_writelane_b32 v0, s13, 49
-; CHECK-NEXT:    v_writelane_b32 v0, s14, 50
-; CHECK-NEXT:    v_writelane_b32 v0, s15, 51
+; CHECK-NEXT:    v_writelane_b32 v0, s0, 32
+; CHECK-NEXT:    v_writelane_b32 v0, s1, 33
+; CHECK-NEXT:    v_writelane_b32 v0, s2, 34
+; CHECK-NEXT:    v_writelane_b32 v0, s3, 35
+; CHECK-NEXT:    v_writelane_b32 v0, s4, 36
+; CHECK-NEXT:    v_writelane_b32 v0, s5, 37
+; CHECK-NEXT:    v_writelane_b32 v0, s6, 38
+; CHECK-NEXT:    v_writelane_b32 v0, s7, 39
+; CHECK-NEXT:    v_writelane_b32 v0, s8, 40
+; CHECK-NEXT:    v_writelane_b32 v0, s9, 41
+; CHECK-NEXT:    v_writelane_b32 v0, s10, 42
+; CHECK-NEXT:    v_writelane_b32 v0, s11, 43
+; CHECK-NEXT:    v_writelane_b32 v0, s12, 44
+; CHECK-NEXT:    v_writelane_b32 v0, s13, 45
+; CHECK-NEXT:    v_writelane_b32 v0, s14, 46
+; CHECK-NEXT:    v_writelane_b32 v0, s15, 47
 ; CHECK-NEXT:    s_cbranch_scc0 .LBB0_2
 ; CHECK-NEXT:  ; %bb.1: ; %ret
 ; CHECK-NEXT:    ; kill: killed $vgpr23
@@ -214,170 +210,166 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
 ; CHECK-NEXT:    v_readlane_b32 s1, v23, 3
 ; CHECK-NEXT:    v_readlane_b32 s2, v23, 4
 ; CHECK-NEXT:    v_readlane_b32 s3, v23, 5
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 6
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 7
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 8
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 9
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[48:51]
+; CHECK-NEXT:    ; use s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 6
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 7
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 8
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 9
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 10
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 11
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 12
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 13
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 10
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 11
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 12
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 13
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 14
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 15
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 16
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 17
-; CHECK-NEXT:    v_readlane_b32 s8, v23, 18
-; CHECK-NEXT:    v_readlane_b32 s9, v23, 19
-; CHECK-NEXT:    v_readlane_b32 s10, v23, 20
-; CHECK-NEXT:    v_readlane_b32 s11, v23, 21
-; CHECK-NEXT:    v_readlane_b32 s12, v23, 22
-; CHECK-NEXT:    v_readlane_b32 s13, v23, 23
-; CHECK-NEXT:    v_readlane_b32 s14, v23, 24
-; CHECK-NEXT:    v_readlane_b32 s15, v23, 25
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 14
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 15
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 16
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 17
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 18
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 19
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 20
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 21
+; CHECK-NEXT:    v_readlane_b32 s8, v23, 22
+; CHECK-NEXT:    v_readlane_b32 s9, v23, 23
+; CHECK-NEXT:    v_readlane_b32 s10, v23, 24
+; CHECK-NEXT:    v_readlane_b32 s11, v23, 25
+; CHECK-NEXT:    v_readlane_b32 s12, v23, 26
+; CHECK-NEXT:    v_readlane_b32 s13, v23, 27
+; CHECK-NEXT:    v_readlane_b32 s14, v23, 28
+; CHECK-NEXT:    v_readlane_b32 s15, v23, 29
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 26
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 27
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 28
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 29
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 30
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 31
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 32
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 33
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 30
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 31
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 32
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 33
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 34
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 35
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 36
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 37
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[38:39]
+; CHECK-NEXT:    ; use s[42:43]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[44:47]
+; CHECK-NEXT:    ; use s[52:55]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 34
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 35
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 36
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 37
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 38
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 39
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 40
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 41
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 38
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 39
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 40
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 41
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[16:31]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[36:37]
+; CHECK-NEXT:    ; use s[40:41]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[40:43]
+; CHECK-NEXT:    ; use s[36:39]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:7]
+; CHECK-NEXT:    ; use s[44:51]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 42
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 43
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 44
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 45
-; CHECK-NEXT:    v_readlane_b32 s4, v23, 46
-; CHECK-NEXT:    v_readlane_b32 s5, v23, 47
-; CHECK-NEXT:    v_readlane_b32 s6, v23, 48
-; CHECK-NEXT:    v_readlane_b32 s7, v23, 49
-; CHECK-NEXT:    v_readlane_b32 s8, v23, 50
-; CHECK-NEXT:    v_readlane_b32 s9, v23, 51
-; CHECK-NEXT:    v_readlane_b32 s10, v23, 52
-; CHECK-NEXT:    v_readlane_b32 s11, v23, 53
-; CHECK-NEXT:    v_readlane_b32 s12, v23, 54
-; CHECK-NEXT:    v_readlane_b32 s13, v23, 55
-; CHECK-NEXT:    v_readlane_b32 s14, v23, 56
-; CHECK-NEXT:    v_readlane_b32 s15, v23, 57
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 42
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 43
+; CHECK-NEXT:    v_readlane_b32 s6, v23, 44
+; CHECK-NEXT:    v_readlane_b32 s7, v23, 45
+; CHECK-NEXT:    v_readlane_b32 s8, v23, 46
+; CHECK-NEXT:    v_readlane_b32 s9, v23, 47
+; CHECK-NEXT:    v_readlane_b32 s10, v23, 48
+; CHECK-NEXT:    v_readlane_b32 s11, v23, 49
+; CHECK-NEXT:    v_readlane_b32 s12, v23, 50
+; CHECK-NEXT:    v_readlane_b32 s13, v23, 51
+; CHECK-NEXT:    v_readlane_b32 s14, v23, 52
+; CHECK-NEXT:    v_readlane_b32 s15, v23, 53
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 58
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 59
-; CHECK-NEXT:    v_readlane_b32 s2, v23, 60
-; CHECK-NEXT:    v_readlane_b32 s3, v23, 61
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 54
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 55
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 56
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 57
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[34:35]
 ; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:3]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v23, 62
-; CHECK-NEXT:    v_readlane_b32 s1, v23, 63
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 0
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 1
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 2
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 3
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 4
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 5
+; CHECK-NEXT:    v_readlane_b32 s0, v23, 58
+; CHECK-NEXT:    v_readlane_b32 s1, v23, 59
+; CHECK-NEXT:    v_readlane_b32 s2, v23, 60
+; CHECK-NEXT:    v_readlane_b32 s3, v23, 61
+; CHECK-NEXT:    v_readlane_b32 s4, v23, 62
+; CHECK-NEXT:    v_readlane_b32 s5, v23, 63
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 0
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 1
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 6
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 7
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 8
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 9
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 10
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 11
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 12
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 13
-; CHECK-NEXT:    v_readlane_b32 s8, v0, 14
-; CHECK-NEXT:    v_readlane_b32 s9, v0, 15
-; CHECK-NEXT:    v_readlane_b32 s10, v0, 16
-; CHECK-NEXT:    v_readlane_b32 s11, v0, 17
-; CHECK-NEXT:    v_readlane_b32 s12, v0, 18
-; CHECK-NEXT:    v_readlane_b32 s13, v0, 19
-; CHECK-NEXT:    v_readlane_b32 s14, v0, 20
-; CHECK-NEXT:    v_readlane_b32 s15, v0, 21
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 2
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 3
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 4
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 5
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 6
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 7
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 8
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 9
+; CHECK-NEXT:    v_readlane_b32 s8, v0, 10
+; CHECK-NEXT:    v_readlane_b32 s9, v0, 11
+; CHECK-NEXT:    v_readlane_b32 s10, v0, 12
+; CHECK-NEXT:    v_readlane_b32 s11, v0, 13
+; CHECK-NEXT:    v_readlane_b32 s12, v0, 14
+; CHECK-NEXT:    v_readlane_b32 s13, v0, 15
+; CHECK-NEXT:    v_readlane_b32 s14, v0, 16
+; CHECK-NEXT:    v_readlane_b32 s15, v0, 17
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 22
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 23
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 18
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 19
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:1]
 ; CHECK-NEXT:    ;;#ASMEND
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 20
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 21
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 22
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 23
+; CHECK-NEXT:    ;;#ASMSTART
+; CHECK-NEXT:    ; use s[0:3]
+; CHECK-NEXT:    ;;#ASMEND
 ; CHECK-NEXT:    v_readlane_b32 s0, v0, 24
 ; CHECK-NEXT:    v_readlane_b32 s1, v0, 25
 ; CHECK-NEXT:    v_readlane_b32 s2, v0, 26
 ; CHECK-NEXT:    v_readlane_b32 s3, v0, 27
-; CHECK-NEXT:    ;;#ASMSTART
-; CHECK-NEXT:    ; use s[0:3]
-; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 28
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 29
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 30
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 31
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 32
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 33
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 34
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 35
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 28
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 29
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 30
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 31
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:7]
 ; CHECK-NEXT:    ;;#ASMEND
-; CHECK-NEXT:    v_readlane_b32 s0, v0, 36
-; CHECK-NEXT:    v_readlane_b32 s1, v0, 37
-; CHECK-NEXT:    v_readlane_b32 s2, v0, 38
-; CHECK-NEXT:    v_readlane_b32 s3, v0, 39
-; CHECK-NEXT:    v_readlane_b32 s4, v0, 40
-; CHECK-NEXT:    v_readlane_b32 s5, v0, 41
-; CHECK-NEXT:    v_readlane_b32 s6, v0, 42
-; CHECK-NEXT:    v_readlane_b32 s7, v0, 43
-; CHECK-NEXT:    v_readlane_b32 s8, v0, 44
-; CHECK-NEXT:    v_readlane_b32 s9, v0, 45
-; CHECK-NEXT:    v_readlane_b32 s10, v0, 46
-; CHECK-NEXT:    v_readlane_b32 s11, v0, 47
-; CHECK-NEXT:    v_readlane_b32 s12, v0, 48
-; CHECK-NEXT:    v_readlane_b32 s13, v0, 49
-; CHECK-NEXT:    v_readlane_b32 s14, v0, 50
-; CHECK-NEXT:    v_readlane_b32 s15, v0, 51
+; CHECK-NEXT:    v_readlane_b32 s0, v0, 32
+; CHECK-NEXT:    v_readlane_b32 s1, v0, 33
+; CHECK-NEXT:    v_readlane_b32 s2, v0, 34
+; CHECK-NEXT:    v_readlane_b32 s3, v0, 35
+; CHECK-NEXT:    v_readlane_b32 s4, v0, 36
+; CHECK-NEXT:    v_readlane_b32 s5, v0, 37
+; CHECK-NEXT:    v_readlane_b32 s6, v0, 38
+; CHECK-NEXT:    v_readlane_b32 s7, v0, 39
+; CHECK-NEXT:    v_readlane_b32 s8, v0, 40
+; CHECK-NEXT:    v_readlane_b32 s9, v0, 41
+; CHECK-NEXT:    v_readlane_b32 s10, v0, 42
+; CHECK-NEXT:    v_readlane_b32 s11, v0, 43
+; CHECK-NEXT:    v_readlane_b32 s12, v0, 44
+; CHECK-NEXT:    v_readlane_b32 s13, v0, 45
+; CHECK-NEXT:    v_readlane_b32 s14, v0, 46
+; CHECK-NEXT:    v_readlane_b32 s15, v0, 47
 ; CHECK-NEXT:    ;;#ASMSTART
 ; CHECK-NEXT:    ; use s[0:15]
 ; CHECK-NEXT:    ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index 2292b45ddf15ae..d430ba758572d6 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -7,7 +7,7 @@
 define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 {
 ; GCN-LABEL: partial_no_vgprs_last_sgpr_spill:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_u32 s0, s0, s15
+; GCN-NEXT:    s_add_u32 s0, s0, s13
 ; GCN-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
 ; GCN-NEXT:    ; implicit-def: $vgpr0 : SGPR spill to VGPR lane
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index f69b7ae1051240..ebc916b5c889b5 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -182,10 +182,8 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_shl_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, s4, 64
 ; GCN-NEXT:    s_sub_i32 s12, 64, s4
@@ -205,7 +203,6 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s6
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -218,10 +215,8 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_lshr_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, s4, 64
 ; GCN-NEXT:    s_sub_i32 s12, 64, s4
@@ -241,7 +236,6 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s6
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -254,10 +248,8 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: s_ashr_i128_ss:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx8 s[0:7], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_sub_i32 s5, 64, s4
 ; GCN-NEXT:    s_lshr_b64 s[6:7], s[0:1], s4
@@ -278,7 +270,6 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
@@ -439,9 +430,6 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_shl_v2i128ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
@@ -514,9 +502,6 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_lshr_v2i128_ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
@@ -589,9 +574,6 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-LABEL: s_ashr_v2i128_ss:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_load_dwordx16 s[0:15], s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v6, 16
 ; GCN-NEXT:    v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index a38f0a6d86b8ca..b8721129222043 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -9,9 +9,6 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
@@ -23,9 +20,6 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -44,14 +38,11 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -61,14 +52,11 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -84,9 +72,6 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_bitcmp1_b32 s2, 0
 ; CI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -101,9 +86,6 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitcmp1_b32 s2, 0
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -122,9 +104,6 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; CI-LABEL: s_sint_to_fp_i64_to_f64:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s3
 ; CI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -138,9 +117,6 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; VI-LABEL: s_sint_to_fp_i64_to_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s3
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -160,9 +136,6 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mov_b32_e32 v1, s3
 ; CI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -182,9 +155,6 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -213,9 +183,6 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_sext_i32_i8 s2, s2
 ; CI-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
@@ -228,9 +195,6 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bfe_i32 s2, s2, 0x80000
 ; VI-NEXT:    s_sext_i32_i16 s2, s2
@@ -268,14 +232,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -285,14 +246,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -325,14 +283,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -342,14 +297,11 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -401,14 +353,11 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; CI-NEXT:    s_add_i32 s10, s10, s15
-; CI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; CI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_cmp_eq_u32 s2, 0
 ; CI-NEXT:    s_cselect_b32 s2, 0, 0xbff00000
 ; CI-NEXT:    v_mov_b32_e32 v3, s1
-; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    v_mov_b32_e32 v1, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -418,14 +367,11 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0, 0xbff00000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index 0a6009c0e7da8a..b4a981f1db4ec7 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -12,10 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
   ; GCN-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
   ; GCN-NEXT:   [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
   ; GCN-NEXT:   [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
-  ; GCN-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %31.sub0
-  ; GCN-NEXT:   SI_SPILL_V64_SAVE %31, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
+  ; GCN-NEXT:   INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %30.sub0
+  ; GCN-NEXT:   SI_SPILL_V64_SAVE %30, %stack.0, $sgpr32, 0, implicit $exec :: (store (s64) into %stack.0, align 4, addrspace 5)
   ; GCN-NEXT:   [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
-  ; GCN-NEXT:   GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+  ; GCN-NEXT:   GLOBAL_STORE_DWORDX4 undef %22:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
   ; GCN-NEXT:   [[SI_SPILL_V64_RESTORE:%[0-9]+]]:vreg_64 = SI_SPILL_V64_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s64) from %stack.0, align 4, addrspace 5)
   ; GCN-NEXT:   INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, [[SI_SPILL_V64_RESTORE]]
   ; GCN-NEXT:   S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index b2e334c66ccd25..3644bef9c20a1f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -50,10 +50,7 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 {
 define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
 ; HAWAII-LABEL: local_store_i55:
 ; HAWAII:       ; %bb.0:
-; HAWAII-NEXT:    s_add_i32 s10, s10, s15
 ; HAWAII-NEXT:    s_or_b32 s0, s6, 14
-; HAWAII-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; HAWAII-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; HAWAII-NEXT:    v_mov_b32_e32 v0, s0
 ; HAWAII-NEXT:    v_mov_b32_e32 v1, s7
 ; HAWAII-NEXT:    flat_load_ubyte v0, v[0:1]
@@ -73,10 +70,7 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
 ;
 ; FIJI-LABEL: local_store_i55:
 ; FIJI:       ; %bb.0:
-; FIJI-NEXT:    s_add_i32 s10, s10, s15
 ; FIJI-NEXT:    s_or_b32 s0, s6, 14
-; FIJI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; FIJI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; FIJI-NEXT:    v_mov_b32_e32 v0, s0
 ; FIJI-NEXT:    v_mov_b32_e32 v1, s7
 ; FIJI-NEXT:    flat_load_ubyte v0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 30accc846d2b6a..19d633651fdd0d 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
+; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000  @...............
 
 ; ELF: AMDGPU Metadata
 ; ELF: .sgpr_count:     9
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 4f84b31f1877b0..2097579e0c9959 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
 ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
+; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000  ................
 
 ; ELF: AMDGPU Metadata
 ; ELF: .sgpr_count:     5
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 0b1bd11b88d5d6..775c62e73261a9 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -1,4 +1,3 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack < %s | FileCheck --check-prefixes=ASM %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-objdump -s -j .rodata - | FileCheck --check-prefixes=OBJ %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+xnack --filetype=obj < %s | llvm-readelf --notes - | FileCheck --check-prefixes=ELF %s
@@ -7,17 +6,15 @@
 
 define amdgpu_kernel void @kern() #0 {
 ; ASM-LABEL: kern:
-; ASM:       ; %bb.0: ; %entry
-; ASM-NEXT:    ;;#ASMSTART
-; ASM-NEXT:    ;;#ASMEND
-; ASM-NEXT:    s_endpgm
+; ASM: .amdhsa_next_free_sgpr 5
+; ASM: .amdhsa_reserve_xnack_mask 1
 
 ; Verify that an extra SGPR block is reserved with XNACK "on" tid setting.
 ; OBJ: Contents of section .rodata:
 ; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000  ................
 ; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000  ................
-; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000  @.......!.......
+; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000  @...............
 
 ; ELF: AMDGPU Metadata
 ; ELF: .sgpr_count:     9
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index 3d1f283270ccfd..52370f6a2ef054 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -23,14 +23,11 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
 ; HSA-TRAP-GFX803-LABEL: trap:
 ; HSA-TRAP-GFX803:       ; %bb.0:
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x0
-; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
-; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s2
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s3
-; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; HSA-TRAP-GFX803-NEXT:    flat_store_dword v[0:1], v2
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    s_trap 2
@@ -124,9 +121,6 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
 ; HSA-TRAP-GFX803-LABEL: non_entry_trap:
 ; HSA-TRAP-GFX803:       ; %bb.0: ; %entry
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
-; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s0
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s1
@@ -289,9 +283,6 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
 ; HSA-TRAP-GFX803:       ; %bb.0:
 ; HSA-TRAP-GFX803-NEXT:    s_mov_b64 s[0:1], s[6:7]
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
-; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
-; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s4
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s5
@@ -426,13 +417,10 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
 ; HSA-TRAP-GFX803-LABEL: debugtrap:
 ; HSA-TRAP-GFX803:       ; %bb.0:
 ; HSA-TRAP-GFX803-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; HSA-TRAP-GFX803-NEXT:    s_add_i32 s12, s12, s17
-; HSA-TRAP-GFX803-NEXT:    s_mov_b32 flat_scratch_lo, s13
-; HSA-TRAP-GFX803-NEXT:    s_lshr_b32 flat_scratch_hi, s12, 8
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v2, 1
+; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v3, 2
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt lgkmcnt(0)
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v0, s0
-; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v3, 2
 ; HSA-TRAP-GFX803-NEXT:    v_mov_b32_e32 v1, s1
 ; HSA-TRAP-GFX803-NEXT:    flat_store_dword v[0:1], v2
 ; HSA-TRAP-GFX803-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 317e350f3eafe5..735956caa72da4 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -81,9 +81,6 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-LABEL: udiv_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -255,9 +252,6 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
 ; GCN-LABEL: s_udiv_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s3
 ; GCN-NEXT:    s_sub_i32 s4, 0, s3
@@ -463,9 +457,6 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: udiv_v2i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -819,9 +810,6 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: udiv_v4i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-NEXT:    s_addc_u32 s5, s3, 0
@@ -1147,9 +1135,6 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
 ; GCN-LABEL: udiv_i32_div_pow2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1239,9 +1224,6 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
 ; GCN-LABEL: udiv_i32_div_k_even:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1336,9 +1318,6 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
 ; GCN-LABEL: udiv_i32_div_k_odd:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1451,9 +1430,6 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
 ; GCN-LABEL: v_udiv_i8:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1594,9 +1570,6 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: v_udiv_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s3
@@ -1753,9 +1726,6 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: v_udiv_i23:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, s2, 4
 ; GCN-NEXT:    s_addc_u32 s5, s3, 0
@@ -1953,9 +1923,6 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-LABEL: v_udiv_i24:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_add_u32 s4, s2, 4
 ; GCN-NEXT:    s_addc_u32 s5, s3, 0
@@ -2138,9 +2105,6 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
 ; GCN-LABEL: scalarize_mulhu_4xi32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -2254,9 +2218,6 @@ define amdgpu_kernel void @test_udiv2(i32 %p) {
 ; GCN-LABEL: test_udiv2:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshr_b32 s0, s0, 1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
@@ -2320,9 +2281,6 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0xaaaaaaab
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_mul_hi_u32 v0, s0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
@@ -2413,9 +2371,6 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon
 ;
 ; GCN-LABEL: fdiv_test_denormals:
 ; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    flat_load_sbyte v2, v[0:1]
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 44b16d7f65dc55..d00ea6dff24474 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -9,9 +9,6 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
 ; SI-NEXT:    v_add_i32_e32 v0, vcc, s2, v0
@@ -31,9 +28,6 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
@@ -60,9 +54,6 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; SI-LABEL: s_uint_to_fp_i64_to_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -76,9 +67,6 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; VI-LABEL: s_uint_to_fp_i64_to_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
@@ -98,9 +86,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
@@ -118,9 +103,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 ; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
@@ -144,9 +126,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4 x i64> %in) {
 ; SI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x8
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -179,9 +158,6 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
 ;
 ; VI-LABEL: s_uint_to_fp_v4i64_to_v4f64:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_load_dwordx8 s[8:15], s[6:7], 0x20
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
@@ -220,9 +196,6 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
@@ -234,9 +207,6 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
@@ -252,9 +222,6 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
 ; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x0
-; GCN-NEXT:    s_add_i32 s10, s10, s15
-; GCN-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; GCN-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[2:3], s3
 ; GCN-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
@@ -272,9 +239,6 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x4
 ; SI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
@@ -295,9 +259,6 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[6:7], 0x10
 ; VI-NEXT:    s_load_dwordx2 s[4:5], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
@@ -325,14 +286,11 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -342,14 +300,11 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -365,9 +320,6 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_bitcmp1_b32 s2, 0
 ; SI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -382,9 +334,6 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_bitcmp1_b32 s2, 0
 ; VI-NEXT:    s_cselect_b64 s[2:3], -1, 0
@@ -404,9 +353,6 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_and_b32 s2, s2, 0xff
 ; SI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
@@ -419,9 +365,6 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_and_b32 s2, s2, 0xff
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
@@ -459,14 +402,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -476,14 +416,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -516,14 +453,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -533,14 +467,11 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -574,14 +505,11 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dword s2, s[6:7], 0x2
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; SI-NEXT:    s_add_i32 s10, s10, s15
-; SI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; SI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_cmp_eq_u32 s2, 0
 ; SI-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
 ; SI-NEXT:    v_mov_b32_e32 v3, s1
-; SI-NEXT:    v_mov_b32_e32 v0, 0
 ; SI-NEXT:    v_mov_b32_e32 v1, s2
 ; SI-NEXT:    v_mov_b32_e32 v2, s0
 ; SI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -591,14 +519,11 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dword s2, s[6:7], 0x8
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[6:7], 0x0
-; VI-NEXT:    s_add_i32 s10, s10, s15
-; VI-NEXT:    s_lshr_b32 flat_scratch_hi, s10, 8
-; VI-NEXT:    s_mov_b32 flat_scratch_lo, s11
+; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_cmp_eq_u32 s2, 0
 ; VI-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
-; VI-NEXT:    v_mov_b32_e32 v0, 0
 ; VI-NEXT:    v_mov_b32_e32 v1, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index 3f2b046ef16615..50927a2cf21afe 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -11,7 +11,7 @@
 define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9()  {
 ; CHECK-LABEL: __omp_offloading_16_dd2df_main_l9:
 ; CHECK:       ; %bb.0: ; %bb
-; CHECK-NEXT:    s_add_u32 s0, s0, s15
+; CHECK-NEXT:    s_add_u32 s0, s0, s13
 ; CHECK-NEXT:    s_addc_u32 s1, s1, 0
 ; CHECK-NEXT:    ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
 ; CHECK-NEXT:    v_mov_b32_e32 v2, v0
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index fa4db7d4a2d550..3046480b3c0d2a 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -25,9 +25,8 @@
 ; CHECK-NEXT: argumentInfo:
 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr8' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
+; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr6' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
 ; CHECK-NEXT: workItemIDX:     { reg: '$vgpr0' }
 ; CHECK-NEXT: psInputAddr:     0
 ; CHECK-NEXT: psInputEnable:   0
@@ -41,7 +40,7 @@
 ; CHECK-NEXT: BitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy:       8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
 ; CHECK-NEXT: body:
   define amdgpu_kernel void @uniform_long_forward_branch_debug(ptr addrspace(1) %arg, i32 %arg1) #0 !dbg !5 {
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 99c3ed91351dc1..3f6f0c909e8bbf 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -25,9 +25,8 @@
 ; CHECK-NEXT: argumentInfo:
 ; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
 ; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
-; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr8' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
+; CHECK-NEXT: workGroupIDX:    { reg: '$sgpr6' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
 ; CHECK-NEXT: workItemIDX:     { reg: '$vgpr0' }
 ; CHECK-NEXT: psInputAddr:     0
 ; CHECK-NEXT: psInputEnable:   0
@@ -41,7 +40,7 @@
 ; CHECK-NEXT: BitsOf32BitAddress: 0
 ; CHECK-NEXT: occupancy:       8
 ; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
 ; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
 ; CHECK-NEXT: body:
 define amdgpu_kernel void @uniform_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) #0 {

>From 6e493899c91a0cece67bf857cfef0564c66ea026 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Tue, 3 Sep 2024 16:25:45 -0700
Subject: [PATCH 5/8] Fix tests after merge from main branch

---
 clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu  | 2 +-
 .../AMDGPU/attributor-flatscratchinit-globalisel.ll       | 8 ++++----
 llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll    | 8 ++++----
 llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll          | 2 +-
 llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll  | 2 +-
 llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll        | 3 ++-
 llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll          | 2 +-
 7 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
index b295bbbdaaf955..7f016180629e4b 100644
--- a/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
+++ b/clang/test/CodeGenCUDA/amdgpu-kernel-arg-pointer-type.cu
@@ -432,7 +432,7 @@ __global__ void kernel4(struct S s) {
 // CHECK-SPIRV-NEXT:    ret void
 //
 // OPT-LABEL: define dso_local amdgpu_kernel void @_Z7kernel5P1S(
-// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// OPT-SAME: ptr addrspace(1) nocapture noundef readonly [[S_COERCE:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] {
 // OPT-NEXT:  [[ENTRY:.*:]]
 // OPT-NEXT:    [[TMP0:%.*]] = load ptr, ptr addrspace(1) [[S_COERCE]], align 8
 // OPT-NEXT:    [[TMP1:%.*]] = load i32, ptr [[TMP0]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
index b7503f26b1ab6d..9efdf3cbb86067 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -4,7 +4,7 @@
 ;; tests of alloca
 
 define void @without_alloca(i1 %arg0) {
-  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg0, ptr addrspace(1) null
   ret void
 }
 
@@ -15,7 +15,7 @@ define void @with_alloca() {
 }
 
 define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
-  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg0, ptr addrspace(1) null
   ret void
 }
 
@@ -453,13 +453,13 @@ declare i32 @llvm.amdgcn.workgroup.id.x()
 
 define void @use_intrinsic_workitem_id_x() {
   %val = call i32 @llvm.amdgcn.workitem.id.x()
-  store volatile i32 %val, ptr addrspace(1) undef
+  store volatile i32 %val, ptr addrspace(1) null
   ret void
 }
 
 define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
   %val = call i32 @llvm.amdgcn.workitem.id.x()
-  store volatile i32 %val, ptr addrspace(1) undef
+  store volatile i32 %val, ptr addrspace(1) null
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index f04c93961b6705..d32ef070b983e4 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -10,7 +10,7 @@ define void @without_alloca(i1 %arg0) {
 ;
 ; GFX10-LABEL: define void @without_alloca(i1 %arg0)
 ; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
-  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg0, ptr addrspace(1) null
   ret void
 }
 
@@ -31,7 +31,7 @@ define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
 ; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
-  store volatile i1 %arg0, ptr addrspace(1) undef
+  store volatile i1 %arg0, ptr addrspace(1) null
   ret void
 }
 
@@ -858,7 +858,7 @@ define void @use_intrinsic_workitem_id_x() {
 ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
 ; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
-  store volatile i32 %val, ptr addrspace(1) undef
+  store volatile i32 %val, ptr addrspace(1) null
   ret void
 }
 
@@ -869,7 +869,7 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
 ; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
-  store volatile i32 %val, ptr addrspace(1) undef
+  store volatile i32 %val, ptr addrspace(1) null
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index ebca9906998787..55113a3b6f263e 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -36,5 +36,5 @@ define amdgpu_kernel void @test_direct_indirect_call() {
 }
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index 074dba1cbcc931..df6a75c7376df4 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -43,5 +43,5 @@ attributes #0 = { "amdgpu-no-dispatch-id" }
 ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
 ;.
 ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
index 840e8bdfb7bab3..b588337da2868d 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
@@ -59,5 +59,6 @@ entry:
 
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
+; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
+; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 183cdb2e1f8627..b5f5e3cf1a1a78 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -81,7 +81,7 @@ define amdgpu_kernel void @test_simple_indirect_call() {
 ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" }
 ;.
 ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
 ; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.

>From 19aedb9c522f520c74911bbdf425337ce2d02dc1 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Wed, 4 Sep 2024 14:20:24 -0700
Subject: [PATCH 6/8] Undo changes made to AMDGPUSubtarget.cpp to limit impact
 on tests. The undo is achieved by merging from upstream, because a recent
 upstream commit changed that file. The AMDGPUSubtarget.cpp changes will be
 included in a separate PR.

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |   4 +-
 .../annotate-kernel-features-hsa-call.ll      |  89 ++---
 .../AMDGPU/annotate-kernel-features-hsa.ll    |  31 +-
 .../AMDGPU/attributor-flatscratchinit.ll      | 372 +++++++++---------
 .../CodeGen/AMDGPU/direct-indirect-call.ll    |   2 +-
 .../AMDGPU/duplicate-attribute-indirect.ll    |   2 +-
 .../CodeGen/AMDGPU/simple-indirect-call-2.ll  |   2 +-
 7 files changed, 250 insertions(+), 252 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index c4c3a06155029c..13fbe9d8f53d33 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -436,7 +436,9 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     }
 
     for (Instruction &I : instructions(F)) {
-      if (isa<AllocaInst>(I) || isa<AddrSpaceCastInst>(I)) {
+      if (isa<AddrSpaceCastInst>(I) &&
+          cast<AddrSpaceCastInst &>(I).getSrcAddressSpace() ==
+              AMDGPUAS::PRIVATE_ADDRESS) {
         removeAssumedBits(FLAT_SCRATCH_INIT);
         return;
       }
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index ab2e28e5f5cbb5..ed136c58379cb3 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -530,7 +530,7 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null)
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -545,7 +545,7 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null)
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -593,7 +593,7 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ATTRIBUTOR_HSA-NEXT:    store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -611,7 +611,7 @@ define void @use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
 ; ATTRIBUTOR_HSA-NEXT:    store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -628,7 +628,7 @@ define void @func_indirect_use_implicitarg_ptr() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @use_implicitarg_ptr()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -645,7 +645,7 @@ define internal void @defined.func() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -658,7 +658,7 @@ define void @func_call_external() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @external.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -673,7 +673,7 @@ define void @func_call_defined() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @defined.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -687,8 +687,8 @@ define void @func_call_asm() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
-; ATTRIBUTOR_HSA-NEXT:    call void asm sideeffect "", ""() #[[ATTR31:[0-9]+]]
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
+; ATTRIBUTOR_HSA-NEXT:    call void asm sideeffect "", ""() #[[ATTR28:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   call void asm sideeffect "", ""() #3
@@ -702,7 +702,7 @@ define amdgpu_kernel void @kern_call_external() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @external.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -717,7 +717,7 @@ define amdgpu_kernel void @func_kern_defined() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR20]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR17]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @defined.func()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -751,7 +751,7 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @use_dispatch_ptr_ret_type()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -769,7 +769,7 @@ define float @func_indirect_call(ptr %fptr) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float [[FPTR]]()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -788,7 +788,7 @@ define float @func_extern_call() #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @extern()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -806,7 +806,7 @@ define float @func_null_call(ptr %fptr) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call
-; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR19]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR16]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float null()
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -827,7 +827,7 @@ define float @func_other_intrinsic_call(float %arg) #3 {
 ; AKF_HSA-NEXT:    ret float [[FADD]]
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call
-; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR20]] {
+; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR17]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]])
 ; ATTRIBUTOR_HSA-NEXT:    [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
 ; ATTRIBUTOR_HSA-NEXT:    ret float [[FADD]]
@@ -845,7 +845,7 @@ define amdgpu_kernel void @kern_sanitize_address() #4 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -861,7 +861,7 @@ define void @func_sanitize_address() #4 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR20:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) null, align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -877,7 +877,7 @@ define void @func_indirect_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR21:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -893,7 +893,7 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -912,7 +912,7 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR21]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @extern_func_sanitize_address()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -928,7 +928,7 @@ define internal void @enqueue_block_def() #6 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR28:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   ret void
@@ -941,7 +941,7 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR29:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR26:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    call void @enqueue_block_decl()
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -995,7 +995,7 @@ define amdgpu_kernel void @kern_callsite_enqueue_block() {
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block
 ; ATTRIBUTOR_HSA-SAME: () #[[ATTR30]] {
-; ATTRIBUTOR_HSA-NEXT:    call void @known_func() #[[ATTR32:[0-9]+]]
+; ATTRIBUTOR_HSA-NEXT:    call void @known_func() #[[ATTR29:[0-9]+]]
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
   call void @known_func() #6
@@ -1037,27 +1037,24 @@ attributes #6 = { "enqueued-block" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR23]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR24]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR26:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR27:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR30]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR31]] = { nounwind }
-; ATTRIBUTOR_HSA: attributes #[[ATTR32]] = { "enqueued-block" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR23:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR24:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR27:[0-9]+]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind }
+; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index aabda171eecb22..5920b24b050beb 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -441,7 +441,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR12]] {
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(5) [[PTR]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -452,7 +452,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 
 define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_group_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(3) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -464,7 +464,7 @@ define amdgpu_kernel void @use_flat_to_group_addrspacecast(ptr %ptr) #1 {
 
 define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_private_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(5) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -483,7 +483,7 @@ define amdgpu_kernel void @use_global_to_flat_addrspacecast(ptr addrspace(1) %pt
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_global_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR13]] {
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
 ; ATTRIBUTOR_HSA-NEXT:    store volatile i32 0, ptr addrspace(1) [[PTR]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -500,7 +500,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_constant_to_flat_addrspacecast
-; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR13]] {
+; ATTRIBUTOR_HSA-SAME: (ptr addrspace(4) [[PTR:%.*]]) #[[ATTR1]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[PTR]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
 ;
@@ -511,7 +511,7 @@ define amdgpu_kernel void @use_constant_to_flat_addrspacecast(ptr addrspace(4) %
 
 define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_global_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1)
 ; HSA-NEXT:    store volatile i32 0, ptr addrspace(1) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -523,7 +523,7 @@ define amdgpu_kernel void @use_flat_to_global_addrspacecast(ptr %ptr) #1 {
 
 define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 {
 ; HSA-LABEL: define {{[^@]+}}@use_flat_to_constant_addrspacecast
-; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR13]] {
+; HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
 ; HSA-NEXT:    [[FTOS:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(4)
 ; HSA-NEXT:    [[LD:%.*]] = load volatile i32, ptr addrspace(4) [[FTOS]], align 4
 ; HSA-NEXT:    ret void
@@ -542,7 +542,7 @@ define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared
-; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]])
 ; ATTRIBUTOR_HSA-NEXT:    [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32
 ; ATTRIBUTOR_HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) undef, align 4
@@ -563,7 +563,7 @@ define amdgpu_kernel void @use_is_private(ptr %ptr) #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private
-; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR14]] {
+; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]])
 ; ATTRIBUTOR_HSA-NEXT:    [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32
 ; ATTRIBUTOR_HSA-NEXT:    store i32 [[EXT]], ptr addrspace(1) undef, align 4
@@ -583,7 +583,7 @@ define amdgpu_kernel void @use_alloca() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; ATTRIBUTOR_HSA-NEXT:    store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -604,7 +604,7 @@ define amdgpu_kernel void @use_alloca_non_entry_block() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR13]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
 ; ATTRIBUTOR_HSA-NEXT:  entry:
 ; ATTRIBUTOR_HSA-NEXT:    br label [[BB:%.*]]
 ; ATTRIBUTOR_HSA:       bb:
@@ -629,7 +629,7 @@ define void @use_alloca_func() #1 {
 ; AKF_HSA-NEXT:    ret void
 ;
 ; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func
-; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
+; ATTRIBUTOR_HSA-SAME: () #[[ATTR14:[0-9]+]] {
 ; ATTRIBUTOR_HSA-NEXT:    [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
 ; ATTRIBUTOR_HSA-NEXT:    store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
 ; ATTRIBUTOR_HSA-NEXT:    ret void
@@ -662,10 +662,9 @@ attributes #1 = { nounwind }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
 ; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index d32ef070b983e4..b33ec280b034c9 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -6,20 +6,20 @@
 
 define void @without_alloca(i1 %arg0) {
 ; GFX9-LABEL: define void @without_alloca(i1 %arg0)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_alloca(i1 %arg0)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI:[0-9]+]]
   store volatile i1 %arg0, ptr addrspace(1) null
   ret void
 }
 
 define void @with_alloca() {
 ; GFX9-LABEL: define void @with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -27,20 +27,20 @@ define void @with_alloca() {
 
 define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2:[0-9]+]]
   store volatile i1 %arg0, ptr addrspace(1) null
   ret void
 }
 
 define amdgpu_kernel void @with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -50,10 +50,10 @@ define amdgpu_kernel void @with_alloca_cc_kernel() {
 
 define amdgpu_vs void @with_alloca_cc_vs() {
 ; GFX9-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS:[0-9]+]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -61,10 +61,10 @@ define amdgpu_vs void @with_alloca_cc_vs() {
 
 define amdgpu_gs void @with_alloca_cc_gs() {
 ; GFX9-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
 ;
 ; GFX10-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -72,10 +72,10 @@ define amdgpu_gs void @with_alloca_cc_gs() {
 
 define amdgpu_ps void @with_alloca_cc_ps() {
 ; GFX9-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
 ;
 ; GFX10-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -83,10 +83,10 @@ define amdgpu_ps void @with_alloca_cc_ps() {
 
 define amdgpu_cs void @with_alloca_cc_cs() {
 ; GFX9-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
 ;
 ; GFX10-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -94,10 +94,10 @@ define amdgpu_cs void @with_alloca_cc_cs() {
 
 define amdgpu_hs void @with_alloca_cc_hs() {
 ; GFX9-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
 ;
 ; GFX10-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -105,10 +105,10 @@ define amdgpu_hs void @with_alloca_cc_hs() {
 
 define amdgpu_ls void @with_alloca_cc_ls() {
 ; GFX9-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
 ;
 ; GFX10-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -116,10 +116,10 @@ define amdgpu_ls void @with_alloca_cc_ls() {
 
 define amdgpu_es void @with_alloca_cc_es() {
 ; GFX9-LABEL: define amdgpu_es void @with_alloca_cc_es()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
 ;
 ; GFX10-LABEL: define amdgpu_es void @with_alloca_cc_es()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -127,10 +127,10 @@ define amdgpu_es void @with_alloca_cc_es() {
 
 define amdgpu_gfx void @with_alloca_cc_gfx() {
 ; GFX9-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR3_GFX9_CC_GRAPHICS2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR3_GFX10_CC_GRAPHICS2:[0-9]+]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -138,10 +138,10 @@ define amdgpu_gfx void @with_alloca_cc_gfx() {
 
 define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
 ; GFX9-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR3_GFX9_CC_GRAPHICS2]]
 ;
 ; GFX10-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR3_GFX10_CC_GRAPHICS2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -149,10 +149,10 @@ define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
 
 define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
 ; GFX9-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
-; GFX9-SAME:  #[[ATTR_GFX9_CC_GRAPHICS2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR3_GFX9_CC_GRAPHICS2]]
 ;
 ; GFX10-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
-; GFX10-SAME:  #[[ATTR_GFX10_CC_GRAPHICS2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR3_GFX10_CC_GRAPHICS2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   ret void
@@ -160,40 +160,40 @@ define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
 
 define void @call_without_alloca() {
 ; GFX9-LABEL: define void @call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @without_alloca(i1 true)
   ret void
 }
 
 define amdgpu_kernel void @call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @without_alloca(i1 true)
   ret void
 }
 
 define void @call_with_alloca() {
 ; GFX9-LABEL: define void @call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @with_alloca()
   ret void
 }
 
 define amdgpu_kernel void @call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @with_alloca()
   ret void
 }
@@ -222,50 +222,50 @@ define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
 
 define void @call_call_without_alloca() {
 ; GFX9-LABEL: define void @call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @call_without_alloca()
   ret void
 }
 
 define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @call_without_alloca()
   ret void
 }
 
 define void @call_call_with_alloca() {
 ; GFX9-LABEL: define void @call_call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @call_with_alloca()
   ret void
 }
 
 define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @call_with_alloca()
   ret void
 }
 
 define void @with_alloca_call_without_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @without_alloca()
@@ -274,10 +274,10 @@ define void @with_alloca_call_without_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @without_alloca()
@@ -286,10 +286,10 @@ define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
 
 define void @with_alloca_call_with_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @with_alloca()
@@ -298,10 +298,10 @@ define void @with_alloca_call_with_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @with_alloca()
@@ -310,10 +310,10 @@ define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
 
 define void @with_alloca_call_call_without_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_without_alloca()
@@ -322,10 +322,10 @@ define void @with_alloca_call_call_without_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_without_alloca()
@@ -334,10 +334,10 @@ define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
 
 define void @with_alloca_call_call_with_alloca() {
 ; GFX9-LABEL: define void @with_alloca_call_call_with_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_alloca_call_call_with_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_with_alloca()
@@ -346,10 +346,10 @@ define void @with_alloca_call_call_with_alloca() {
 
 define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %temp = alloca i32, addrspace(5)
   store volatile i32 0, ptr addrspace(5) %temp, align 4
   call void @call_with_alloca()
@@ -360,30 +360,30 @@ define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
 
 define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -391,10 +391,10 @@ define void @with_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 
 define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %stof = addrspacecast ptr addrspace(1) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -402,30 +402,30 @@ define amdgpu_kernel void @with_global_to_flat_addrspacecast_cc_kernel(ptr addrs
 
 define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(2) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(2) %ptr
   ret void
 }
 
 define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %stof = addrspacecast ptr addrspace(2) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -433,10 +433,10 @@ define void @with_region_to_flat_addrspacecast(ptr addrspace(2) %ptr) {
 
 define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrspace(2) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %stof = addrspacecast ptr addrspace(2) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -444,30 +444,30 @@ define amdgpu_kernel void @with_region_to_flat_addrspacecast_cc_kernel(ptr addrs
 
 define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(3) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(3) %ptr
   ret void
 }
 
 define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -475,10 +475,10 @@ define void @with_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) {
 
 define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %stof = addrspacecast ptr addrspace(3) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -486,30 +486,30 @@ define amdgpu_kernel void @with_group_to_flat_addrspacecast_cc_kernel(ptr addrsp
 
 define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(4) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(4) %ptr
   ret void
 }
 
 define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   %stof = addrspacecast ptr addrspace(4) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -517,10 +517,10 @@ define void @with_constant_to_flat_addrspacecast(ptr addrspace(4) %ptr) {
 
 define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr addrspace(4) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %stof = addrspacecast ptr addrspace(4) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -528,30 +528,30 @@ define amdgpu_kernel void @with_constant_to_flat_addrspacecast_cc_kernel(ptr add
 
 define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   store volatile i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   store volatile i32 0, ptr addrspace(5) %ptr
   ret void
 }
 
 define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -559,10 +559,10 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -570,50 +570,50 @@ define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addr
 
 define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -621,10 +621,10 @@ define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrsp
 
 define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -632,70 +632,70 @@ define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacec
 
 define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -704,10 +704,10 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(
 
 define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -716,10 +716,10 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_
 
 define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -728,10 +728,10 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5)
 
 define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -742,10 +742,10 @@ define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_
 
 define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
   call void @without_alloca(i1 true)
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -753,10 +753,10 @@ define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr
 
 define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   call void @without_alloca(i1 true)
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -764,10 +764,10 @@ define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kern
 
 define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
   call void @without_alloca(i1 true)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -775,10 +775,10 @@ define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   call void @without_alloca(i1 true)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -790,10 +790,10 @@ define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(
 
 define void @with_indirect_call() {
 ; GFX9-LABEL: define void @with_indirect_call()
-; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL:[0-9]+]]
+; GFX9-SAME:  #[[ATTR6_GFX9_IND_CALL:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_indirect_call()
-; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL:[0-9]+]] {
+; GFX10-SAME:  #[[ATTR6_GFX10_IND_CALL:[0-9]+]] {
   %fptr = load ptr, ptr addrspace(4) @gv.fptr0
   call void %fptr()
   ret void
@@ -801,10 +801,10 @@ define void @with_indirect_call() {
 
 define amdgpu_kernel void @with_indirect_call_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR7_GFX9_IND_CALL2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR7_GFX10_IND_CALL2:[0-9]+]]
   %fptr = load ptr, ptr addrspace(4) @gv.fptr0
   call void %fptr()
   ret void
@@ -812,20 +812,20 @@ define amdgpu_kernel void @with_indirect_call_cc_kernel() {
 
 define void @call_with_indirect_call() {
 ; GFX9-LABEL: define void @call_with_indirect_call()
-; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL:[0-9]+]]
+; GFX9-SAME:  #[[ATTR6_GFX9_IND_CALL]]
 ;
 ; GFX10-LABEL: define void @call_with_indirect_call()
-; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL:[0-9]+]]
+; GFX10-SAME:  #[[ATTR6_GFX10_IND_CALL]]
   call void @with_indirect_call()
   ret void
 }
 
 define amdgpu_kernel void @call_with_indirect_call_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_IND_CALL2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR7_GFX9_IND_CALL2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_IND_CALL2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR7_GFX10_IND_CALL2]]
   call void @with_indirect_call()
   ret void
 }
@@ -840,10 +840,10 @@ define void @also_empty() {
 
 define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) {
 ; GFX9-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond)
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR8_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond)
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR8_GFX10_NOFSI3:[0-9]+]]
   %fptr = select i1 %cond, ptr @empty, ptr @also_empty
   call void %fptr()
   ret void
@@ -853,10 +853,10 @@ declare i32 @llvm.amdgcn.workgroup.id.x()
 
 define void @use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR10_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR10_GFX10_NOFSI4:[0-9]+]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %val, ptr addrspace(1) null
   ret void
@@ -864,10 +864,10 @@ define void @use_intrinsic_workitem_id_x() {
 
 define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %val, ptr addrspace(1) null
   ret void
@@ -875,30 +875,30 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 
 define void @call_use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI4]]
+; GFX9-SAME:  #[[ATTR10_GFX9_NOFSI4]]
 ;
 ; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI4]]
+; GFX10-SAME:  #[[ATTR10_GFX10_NOFSI4]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR11_GFX9_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR11_GFX10_NOFSI5:[0-9]+]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
   store volatile i32 7, ptr %1, align 4
   ret void
@@ -906,65 +906,65 @@ define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
 
 define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
   call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @with_inline_asm() {
 ; GFX9-LABEL: with_inline_asm
-; GFX9-SAME:  #[[ATTR_GFX9_NOFSI3]]
+; GFX9-SAME:  #[[ATTR8_GFX9_NOFSI3]]
 ;
 ; GFX10-LABEL: with_inline_asm
-; GFX10-SAME:  #[[ATTR_GFX10_NOFSI3]]
+; GFX10-SAME:  #[[ATTR8_GFX10_NOFSI3]]
   call void asm sideeffect "; use $0", "a"(i32 poison)
   ret void
 }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR0_GFX9_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR1_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR2_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR3_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR4_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9:  attributes #[[ATTR_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR5_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9:  attributes #[[ATTR_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR6_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR7_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR8_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR10_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR11_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
 
 
 
 
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR0_GFX10_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR1_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR2_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR3_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR4_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR5_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR6_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR7_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR8_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR10_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR11_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index 55113a3b6f263e..c108b93766bcc8 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -36,5 +36,5 @@ define amdgpu_kernel void @test_direct_indirect_call() {
 }
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index df6a75c7376df4..908746a0c7784f 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -43,5 +43,5 @@ attributes #0 = { "amdgpu-no-dispatch-id" }
 ; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
 ;.
 ; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
index b588337da2868d..79edfdee6b99ac 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call-2.ll
@@ -60,5 +60,5 @@ entry:
 ;.
 ; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
 ; OW: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
-; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CW: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
 ;.

>From b31a7e56c0a69d0311671f2f733be62e7be9dc62 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Wed, 4 Sep 2024 16:50:38 -0700
Subject: [PATCH 7/8] Code improvement in the lambda.

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 13fbe9d8f53d33..b31c6238188b4c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -702,15 +702,16 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       const auto &CB = cast<CallBase>(I);
       const Function *Callee = CB.getCalledFunction();
 
-      if (Callee && Callee->isIntrinsic())
+      // Callee == 0 for inline asm or indirect call with known callees.
+      // In the latter case, updateImpl() already checked the callees and we
+      // know their FLAT_SCRATCH_INIT bit is set.
+      // If function has indirect call with unknown callees, the bit is
+      // already removed in updateImpl() and execution won't reach here.
+      if (!Callee)
+        return true;
+      else
         return Callee->getIntrinsicID() !=
                Intrinsic::amdgcn_addrspacecast_nonnull;
-
-      // Return true for all other cases, including (1)inline asm, (2)direct
-      // call, and (3)indirect call with known callees. For (2) and (3)
-      // updateImpl() already checked the callees and we know their
-      // FLAT_SCRATCH_INIT bit is set.
-      return true;
     };
 
     bool UsedAssumedInformation = false;

>From 74d9ef1c9bf94d7a9ea53ea4be486a7da4a76440 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Wed, 18 Sep 2024 15:55:50 -0700
Subject: [PATCH 8/8] (1) check constant for addrspacecast (2) remove alloca
 related tests.

---
 llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp   |  41 +-
 .../attributor-flatscratchinit-globalisel.ll  | 443 --------------
 .../AMDGPU/attributor-flatscratchinit.ll      | 556 +++---------------
 3 files changed, 133 insertions(+), 907 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index b31c6238188b4c..b3bcc979e23a5e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -435,13 +435,24 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       return;
     }
 
+    SmallPtrSet<const Constant *, 8> VisitedConsts;
+
     for (Instruction &I : instructions(F)) {
       if (isa<AddrSpaceCastInst>(I) &&
-          cast<AddrSpaceCastInst &>(I).getSrcAddressSpace() ==
+          cast<AddrSpaceCastInst>(I).getSrcAddressSpace() ==
               AMDGPUAS::PRIVATE_ADDRESS) {
         removeAssumedBits(FLAT_SCRATCH_INIT);
         return;
       }
+      // Check for addrspacecast in constant expression operands.
+      for (const Use &U : I.operands()) {
+        if (const auto *C = dyn_cast<Constant>(U)) {
+          if (constHasASCast(C, VisitedConsts)) {
+            removeAssumedBits(FLAT_SCRATCH_INIT);
+            return;
+          }
+        }
+      }
     }
   }
 
@@ -709,9 +720,9 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
       // already removed in updateImpl() and execution won't reach here.
       if (!Callee)
         return true;
-      else
-        return Callee->getIntrinsicID() !=
-               Intrinsic::amdgcn_addrspacecast_nonnull;
+
+      return Callee->getIntrinsicID() !=
+             Intrinsic::amdgcn_addrspacecast_nonnull;
     };
 
     bool UsedAssumedInformation = false;
@@ -721,6 +732,28 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
     return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                               UsedAssumedInformation);
   }
+
+  /// Return true if \p C, or a constant reachable through its operands, is an
+  /// addrspacecast from the private address space. \p Visited is only marked
+  /// on entry: marking operands before recursing would end the walk early.
+  bool constHasASCast(const Constant *C,
+                      SmallPtrSetImpl<const Constant *> &Visited) {
+    if (!Visited.insert(C).second)
+      return false;
+
+    if (const auto *CE = dyn_cast<ConstantExpr>(C))
+      if (CE->getOpcode() == Instruction::AddrSpaceCast &&
+          CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+              AMDGPUAS::PRIVATE_ADDRESS)
+        return true;
+
+    for (const Use &U : C->operands()) {
+      const auto *OpC = dyn_cast<Constant>(U);
+      if (OpC && constHasASCast(OpC, Visited))
+        return true;
+    }
+    return false;
+  }
 };
 
 AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
index 9efdf3cbb86067..9ae5782298a289 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -1,200 +1,6 @@
 ; Test the generation of the attribute amdgpu-no-flat-scratch-init
 ; RUN: opt -S -O2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -global-isel -stop-after=irtranslator | FileCheck -check-prefixes=GFX10 %s
 
-;; tests of alloca
-
-define void @without_alloca(i1 %arg0) {
-  store volatile i1 %arg0, ptr addrspace(1) null
-  ret void
-}
-
-define void @with_alloca() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
-  store volatile i1 %arg0, ptr addrspace(1) null
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_cc_kernel() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-; graphics functions won't get the attribute amdgpu-no-flat-scratch-init
-
-define amdgpu_vs void @with_alloca_cc_vs() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_gs void @with_alloca_cc_gs() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_ps void @with_alloca_cc_ps() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_cs void @with_alloca_cc_cs() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_hs void @with_alloca_cc_hs() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_ls void @with_alloca_cc_ls() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_es void @with_alloca_cc_es() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_gfx void @with_alloca_cc_gfx() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define void @call_without_alloca() {
-  call void @without_alloca(i1 true)
-  ret void
-}
-
-define amdgpu_kernel void @call_without_alloca_cc_kernel() {
-  call void @without_alloca(i1 true)
-  ret void
-}
-
-define void @call_with_alloca() {
-  call void @with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_with_alloca_cc_kernel() {
-  call void @with_alloca()
-  ret void
-}
-
-define void @call_both_with_and_without_alloca() {
-  call void @with_alloca()
-  call void @without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
-  call void @with_alloca()
-  call void @without_alloca()
-  ret void
-}
-
-define void @call_call_without_alloca() {
-  call void @call_without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
-  call void @call_without_alloca()
-  ret void
-}
-
-define void @call_call_with_alloca() {
-  call void @call_with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
-  call void @call_with_alloca()
-  ret void
-}
-
-define void @with_alloca_call_without_alloca() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @without_alloca()
-  ret void
-}
-
-define void @with_alloca_call_with_alloca() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @with_alloca()
-  ret void
-}
-
-define void @with_alloca_call_call_without_alloca() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_without_alloca()
-  ret void
-}
-
-define void @with_alloca_call_call_with_alloca() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_with_alloca()
-  ret void
-}
-
 ;; tests of addrspacecast
 
 define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
@@ -397,32 +203,6 @@ define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_
   ret void
 }
 
-;; tests of mixed alloca and addrspacecast
-
-define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
-  call void @without_alloca(i1 true)
-  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-  ret void
-}
-
-define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
-  call void @without_alloca(i1 true)
-  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-  ret void
-}
-
-define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
-  call void @without_alloca(i1 true)
-  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-  ret void
-}
-
-define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
-  call void @without_alloca(i1 true)
-  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-  ret void
-}
-
 ;; tests of indirect call, intrinsics
 
 @gv.fptr0 = external hidden unnamed_addr addrspace(4) constant ptr, align 4
@@ -473,201 +253,6 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
   ret void
 }
 
-; GFX10: name:            without_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            with_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            without_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            with_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            with_alloca_cc_vs
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
-;
-; GFX10: name:            with_alloca_cc_gs
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
-;
-; GFX10: name:            with_alloca_cc_ps
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
-;
-; GFX10: name:            with_alloca_cc_cs
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
-;
-; GFX10: name:            with_alloca_cc_hs
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr5' }
-;
-; GFX10: name:            with_alloca_cc_ls
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
-;
-; GFX10: name:            with_alloca_cc_es
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentWaveByteOffset: { reg: '$sgpr0' }
-;
-; GFX10: name:            with_alloca_cc_gfx
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:  psInputAddr:     0
-;
-; GFX10: name:            with_alloca_cc_cs_chain
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' }
-; GFX10-NEXT:  psInputAddr:     0
-;
-; GFX10: name:            with_alloca_cc_cs_chain_preserve
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr48_sgpr49_sgpr50_sgpr51' }
-; GFX10-NEXT:  psInputAddr:     0
-;
-; GFX10: name:            call_without_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_without_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
-;
-; GFX10: name:            call_with_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_with_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            call_both_with_and_without_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_both_with_and_without_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            call_call_without_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_call_without_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr4' }
-;
-; GFX10: name:            call_call_with_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_call_with_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            with_alloca_call_without_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            with_alloca_call_without_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            with_alloca_call_with_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            with_alloca_call_with_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            with_alloca_call_call_without_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            with_alloca_call_call_without_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            with_alloca_call_call_with_alloca
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            with_alloca_call_call_with_alloca_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    flatScratchInit: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
 ; GFX10: name:            without_global_to_flat_addrspacecast
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
@@ -921,34 +506,6 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
 ; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
 ;
-; GFX10: name:            call_without_alloca_and_without_addrspacecast
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_without_alloca_and_without_addrspacecast_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
-; GFX10: name:            call_without_alloca_and_with_addrspacecast
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    queuePtr:        { reg: '$sgpr6_sgpr7' }
-; GFX10-NEXT:    dispatchID:      { reg: '$sgpr10_sgpr11' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr12' }
-;
-; GFX10: name:            call_without_alloca_and_with_addrspacecast_cc_kernel
-; GFX10:       argumentInfo:
-; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
-; GFX10-NEXT:    kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT:    workGroupIDX:    { reg: '$sgpr6' }
-;
 ; GFX10: name:            with_indirect_call
 ; GFX10:       argumentInfo:
 ; GFX10-NEXT:    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
index b33ec280b034c9..2229ce9500cacf 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit.ll
@@ -2,378 +2,24 @@
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX9 %s
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=GFX10 %s
 
-;; tests of alloca
-
-define void @without_alloca(i1 %arg0) {
-; GFX9-LABEL: define void @without_alloca(i1 %arg0)
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI:[0-9]+]]
-;
-; GFX10-LABEL: define void @without_alloca(i1 %arg0)
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI:[0-9]+]]
-  store volatile i1 %arg0, ptr addrspace(1) null
-  ret void
-}
-
-define void @with_alloca() {
-; GFX9-LABEL: define void @with_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @with_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0) {
-; GFX9-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2:[0-9]+]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @without_alloca_cc_kernel(i1 %arg0)
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2:[0-9]+]]
-  store volatile i1 %arg0, ptr addrspace(1) null
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-; graphics functions won't get the attribute amdgpu-no-flat-scratch-init
-
-define amdgpu_vs void @with_alloca_cc_vs() {
-; GFX9-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS:[0-9]+]]
-;
-; GFX10-LABEL: define amdgpu_vs void @with_alloca_cc_vs()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS:[0-9]+]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_gs void @with_alloca_cc_gs() {
-; GFX9-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
-;
-; GFX10-LABEL: define amdgpu_gs void @with_alloca_cc_gs()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_ps void @with_alloca_cc_ps() {
-; GFX9-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
-;
-; GFX10-LABEL: define amdgpu_ps void @with_alloca_cc_ps()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_cs void @with_alloca_cc_cs() {
-; GFX9-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
-;
-; GFX10-LABEL: define amdgpu_cs void @with_alloca_cc_cs()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_hs void @with_alloca_cc_hs() {
-; GFX9-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
-;
-; GFX10-LABEL: define amdgpu_hs void @with_alloca_cc_hs()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_ls void @with_alloca_cc_ls() {
-; GFX9-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
-;
-; GFX10-LABEL: define amdgpu_ls void @with_alloca_cc_ls()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_es void @with_alloca_cc_es() {
-; GFX9-LABEL: define amdgpu_es void @with_alloca_cc_es()
-; GFX9-SAME:  #[[ATTR2_GFX9_CC_GRAPHICS]]
-;
-; GFX10-LABEL: define amdgpu_es void @with_alloca_cc_es()
-; GFX10-SAME:  #[[ATTR2_GFX10_CC_GRAPHICS]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_gfx void @with_alloca_cc_gfx() {
-; GFX9-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
-; GFX9-SAME:  #[[ATTR3_GFX9_CC_GRAPHICS2:[0-9]+]]
-;
-; GFX10-LABEL: define amdgpu_gfx void @with_alloca_cc_gfx()
-; GFX10-SAME:  #[[ATTR3_GFX10_CC_GRAPHICS2:[0-9]+]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_cs_chain void @with_alloca_cc_cs_chain() {
-; GFX9-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
-; GFX9-SAME:  #[[ATTR3_GFX9_CC_GRAPHICS2]]
-;
-; GFX10-LABEL: define amdgpu_cs_chain void @with_alloca_cc_cs_chain()
-; GFX10-SAME:  #[[ATTR3_GFX10_CC_GRAPHICS2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve() {
-; GFX9-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
-; GFX9-SAME:  #[[ATTR3_GFX9_CC_GRAPHICS2]]
-;
-; GFX10-LABEL: define amdgpu_cs_chain_preserve void @with_alloca_cc_cs_chain_preserve()
-; GFX10-SAME:  #[[ATTR3_GFX10_CC_GRAPHICS2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  ret void
-}
-
-define void @call_without_alloca() {
-; GFX9-LABEL: define void @call_without_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @call_without_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  call void @without_alloca(i1 true)
-  ret void
-}
-
-define amdgpu_kernel void @call_without_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  call void @without_alloca(i1 true)
-  ret void
-}
-
-define void @call_with_alloca() {
-; GFX9-LABEL: define void @call_with_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @call_with_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  call void @with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_with_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  call void @with_alloca()
-  ret void
-}
-
-define void @call_both_with_and_without_alloca() {
-; GFX9-LABEL: define void @call_both_with_and_without_alloca()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI3:[0-9]+]]
-;
-; GFX10-LABEL: define void @call_both_with_and_without_alloca()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI3:[0-9]+]]
-  call void @with_alloca()
-  call void @without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR_GFX9_NO_NOFSI4:[0-9]+]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR_GFX10_NO_NOFSI4:[0-9]+]]
-  call void @with_alloca()
-  call void @without_alloca()
-  ret void
-}
-
-define void @call_call_without_alloca() {
-; GFX9-LABEL: define void @call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  call void @call_without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_call_without_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  call void @call_without_alloca()
-  ret void
-}
-
-define void @call_call_with_alloca() {
-; GFX9-LABEL: define void @call_call_with_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @call_call_with_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  call void @call_with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @call_call_with_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  call void @call_with_alloca()
-  ret void
-}
-
-define void @with_alloca_call_without_alloca() {
-; GFX9-LABEL: define void @with_alloca_call_without_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @with_alloca_call_without_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @without_alloca()
-  ret void
-}
-
-define void @with_alloca_call_with_alloca() {
-; GFX9-LABEL: define void @with_alloca_call_with_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @with_alloca_call_with_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @with_alloca()
-  ret void
-}
-
-define void @with_alloca_call_call_without_alloca() {
-; GFX9-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @with_alloca_call_call_without_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_without_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_without_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_without_alloca()
-  ret void
-}
-
-define void @with_alloca_call_call_with_alloca() {
-; GFX9-LABEL: define void @with_alloca_call_call_with_alloca()
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @with_alloca_call_call_with_alloca()
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_with_alloca()
-  ret void
-}
-
-define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel() {
-; GFX9-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @with_alloca_call_call_with_alloca_cc_kernel()
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  %temp = alloca i32, addrspace(5)
-  store volatile i32 0, ptr addrspace(5) %temp, align 4
-  call void @call_with_alloca()
-  ret void
-}
-
 ;; tests of addrspacecast
 
 define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
+; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @without_global_to_flat_addrspacecast(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
+; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI:[0-9]+]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
 
 define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @without_global_to_flat_addrspacecast_cc_kernel(ptr addrspace(1) %ptr)
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
+; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2:[0-9]+]]
   store volatile i32 0, ptr addrspace(1) %ptr
   ret void
 }
@@ -548,10 +194,10 @@ define amdgpu_kernel void @without_private_to_flat_addrspacecast_cc_kernel(ptr a
 
 define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI:[0-9]+]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -559,10 +205,10 @@ define void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 
 define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2:[0-9]+]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   ret void
@@ -590,30 +236,30 @@ define amdgpu_kernel void @call_without_private_to_flat_addrspacecast_cc_kernel(
 
 define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -621,10 +267,10 @@ define void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrsp
 
 define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
@@ -652,50 +298,50 @@ define amdgpu_kernel void @call_call_without_private_to_flat_addrspacecast_cc_ke
 
 define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   call void @call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @call_call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_call_both_with_and_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   call void @call_both_with_and_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
 define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -704,10 +350,10 @@ define void @with_cast_call_without_private_to_flat_addrspacecast(ptr addrspace(
 
 define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -716,10 +362,10 @@ define amdgpu_kernel void @with_cast_call_without_private_to_flat_addrspacecast_
 
 define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
+; GFX9-SAME:  #[[ATTR2_GFX9_NO_NOFSI]]
 ;
 ; GFX10-LABEL: define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
+; GFX10-SAME:  #[[ATTR2_GFX10_NO_NOFSI]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
@@ -728,59 +374,55 @@ define void @with_cast_call_with_private_to_flat_addrspacecast(ptr addrspace(5)
 
 define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_cast_call_with_private_to_flat_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   %stof = addrspacecast ptr addrspace(5) %ptr to ptr
   store volatile i32 0, ptr %stof
   call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
   ret void
 }
 
-;; tests of mixed alloca and addrspacecast
+;; tests of addrspacecast in a constant
 
-define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR0_GFX9_NOFSI]]
-;
-; GFX10-LABEL: define void @call_without_alloca_and_without_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR0_GFX10_NOFSI]]
-  call void @without_alloca(i1 true)
-  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+define amdgpu_kernel void @with_global_to_flat_in_const() {
+; GFX9-LABEL: define amdgpu_kernel void @with_global_to_flat_in_const()
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
+  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(1) null to ptr addrspace(0))
   ret void
 }
 
-define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
+define amdgpu_kernel void @with_region_to_flat_in_const() {
+; GFX9-LABEL: define amdgpu_kernel void @with_region_to_flat_in_const()
 ; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_without_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR1_GFX10_NOFSI2]]
-  call void @without_alloca(i1 true)
-  call void @without_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(2) null to ptr addrspace(0))
   ret void
 }
 
-define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR4_GFX9_NO_NOFSI]]
-;
-; GFX10-LABEL: define void @call_without_alloca_and_with_addrspacecast(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR4_GFX10_NO_NOFSI]]
-  call void @without_alloca(i1 true)
-  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+define amdgpu_kernel void @with_local_to_flat_in_const() {
+; GFX9-LABEL: define amdgpu_kernel void @with_local_to_flat_in_const()
+; GFX9-SAME:  #[[ATTR1_GFX9_NOFSI2]]
+  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(3) null to ptr addrspace(0))
   ret void
 }
 
-define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr) {
-; GFX9-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
-;
-; GFX10-LABEL: define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(ptr addrspace(5) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
-  call void @without_alloca(i1 true)
-  call void @with_private_to_flat_addrspacecast(ptr addrspace(5) %ptr)
+define amdgpu_kernel void @with_constant_to_flat_in_const() {
+  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(4) null to ptr addrspace(0)) ; addrspace(4) is the AMDGPU constant address space
+  ret void
+}
+
+define amdgpu_kernel void @with_private_to_flat_in_const() {
+; GFX9-LABEL: define amdgpu_kernel void @with_private_to_flat_in_const()
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
+  store i32 7, ptr addrspace(0) addrspacecast (ptr addrspace(5) null to ptr addrspace(0))
+  ret void
+}
+
+define amdgpu_kernel void @call_with_private_to_flat_in_const() {
+; GFX9-LABEL: define amdgpu_kernel void @call_with_private_to_flat_in_const()
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
+  call void @with_private_to_flat_in_const()
   ret void
 }
 
@@ -790,10 +432,10 @@ define amdgpu_kernel void @call_without_alloca_and_with_addrspacecast_cc_kernel(
 
 define void @with_indirect_call() {
 ; GFX9-LABEL: define void @with_indirect_call()
-; GFX9-SAME:  #[[ATTR6_GFX9_IND_CALL:[0-9]+]]
+; GFX9-SAME:  #[[ATTR2_GFX9_IND_CALL:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @with_indirect_call()
-; GFX10-SAME:  #[[ATTR6_GFX10_IND_CALL:[0-9]+]] {
+; GFX10-SAME:  #[[ATTR2_GFX10_IND_CALL:[0-9]+]] {
   %fptr = load ptr, ptr addrspace(4) @gv.fptr0
   call void %fptr()
   ret void
@@ -801,10 +443,10 @@ define void @with_indirect_call() {
 
 define amdgpu_kernel void @with_indirect_call_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
-; GFX9-SAME:  #[[ATTR7_GFX9_IND_CALL2:[0-9]+]]
+; GFX9-SAME:  #[[ATTR3_GFX9_IND_CALL2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @with_indirect_call_cc_kernel()
-; GFX10-SAME:  #[[ATTR7_GFX10_IND_CALL2:[0-9]+]]
+; GFX10-SAME:  #[[ATTR3_GFX10_IND_CALL2:[0-9]+]]
   %fptr = load ptr, ptr addrspace(4) @gv.fptr0
   call void %fptr()
   ret void
@@ -812,20 +454,20 @@ define amdgpu_kernel void @with_indirect_call_cc_kernel() {
 
 define void @call_with_indirect_call() {
 ; GFX9-LABEL: define void @call_with_indirect_call()
-; GFX9-SAME:  #[[ATTR6_GFX9_IND_CALL]]
+; GFX9-SAME:  #[[ATTR4_GFX9_IND_CALL:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @call_with_indirect_call()
-; GFX10-SAME:  #[[ATTR6_GFX10_IND_CALL]]
+; GFX10-SAME:  #[[ATTR4_GFX10_IND_CALL:[0-9]+]]
   call void @with_indirect_call()
   ret void
 }
 
 define amdgpu_kernel void @call_with_indirect_call_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
-; GFX9-SAME:  #[[ATTR7_GFX9_IND_CALL2]]
+; GFX9-SAME:  #[[ATTR5_GFX9_IND_CALL2:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_with_indirect_call_cc_kernel()
-; GFX10-SAME:  #[[ATTR7_GFX10_IND_CALL2]]
+; GFX10-SAME:  #[[ATTR5_GFX10_IND_CALL2:[0-9]+]]
   call void @with_indirect_call()
   ret void
 }
@@ -840,10 +482,10 @@ define void @also_empty() {
 
 define amdgpu_kernel void @indirect_call_known_callees(i1 %cond) {
 ; GFX9-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond)
-; GFX9-SAME:  #[[ATTR8_GFX9_NOFSI3:[0-9]+]]
+; GFX9-SAME:  #[[ATTR6_GFX9_NOFSI3:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @indirect_call_known_callees(i1 %cond)
-; GFX10-SAME:  #[[ATTR8_GFX10_NOFSI3:[0-9]+]]
+; GFX10-SAME:  #[[ATTR6_GFX10_NOFSI3:[0-9]+]]
   %fptr = select i1 %cond, ptr @empty, ptr @also_empty
   call void %fptr()
   ret void
@@ -853,10 +495,10 @@ declare i32 @llvm.amdgcn.workgroup.id.x()
 
 define void @use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR10_GFX9_NOFSI4:[0-9]+]]
+; GFX9-SAME:  #[[ATTR8_GFX9_NOFSI4:[0-9]+]]
 ;
 ; GFX10-LABEL: define void @use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR10_GFX10_NOFSI4:[0-9]+]]
+; GFX10-SAME:  #[[ATTR8_GFX10_NOFSI4:[0-9]+]]
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %val, ptr addrspace(1) null
   ret void
@@ -875,30 +517,30 @@ define amdgpu_kernel void @use_intrinsic_workitem_id_x_cc_kernel() {
 
 define void @call_use_intrinsic_workitem_id_x() {
 ; GFX9-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX9-SAME:  #[[ATTR10_GFX9_NOFSI4]]
+; GFX9-SAME:  #[[ATTR8_GFX9_NOFSI4]]
 ;
 ; GFX10-LABEL: define void @call_use_intrinsic_workitem_id_x()
-; GFX10-SAME:  #[[ATTR10_GFX10_NOFSI4]]
+; GFX10-SAME:  #[[ATTR8_GFX10_NOFSI4]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
 ; GFX9-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX9-SAME:  #[[ATTR11_GFX9_NOFSI5:[0-9]+]]
+; GFX9-SAME:  #[[ATTR9_GFX9_NOFSI5:[0-9]+]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel()
-; GFX10-SAME:  #[[ATTR11_GFX10_NOFSI5:[0-9]+]]
+; GFX10-SAME:  #[[ATTR9_GFX10_NOFSI5:[0-9]+]]
   call void @use_intrinsic_workitem_id_x()
   ret void
 }
 
 define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   %1 = call ptr @llvm.amdgcn.addrspacecast.nonnull.p0.p3(ptr addrspace(3) %ptr)
   store volatile i32 7, ptr %1, align 4
   ret void
@@ -906,20 +548,20 @@ define amdgpu_kernel void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
 
 define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr) {
 ; GFX9-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX9-SAME:  #[[ATTR5_GFX9_NO_NOFSI2]]
+; GFX9-SAME:  #[[ATTR3_GFX9_NO_NOFSI2]]
 ;
 ; GFX10-LABEL: define amdgpu_kernel void @call_calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
-; GFX10-SAME:  #[[ATTR5_GFX10_NO_NOFSI2]]
+; GFX10-SAME:  #[[ATTR3_GFX10_NO_NOFSI2]]
   call void @calls_intrin_ascast_cc_kernel(ptr addrspace(3) %ptr)
   ret void
 }
 
 define amdgpu_kernel void @with_inline_asm() {
 ; GFX9-LABEL: with_inline_asm
-; GFX9-SAME:  #[[ATTR8_GFX9_NOFSI3]]
+; GFX9-SAME:  #[[ATTR6_GFX9_NOFSI3]]
 ;
 ; GFX10-LABEL: with_inline_asm
-; GFX10-SAME:  #[[ATTR8_GFX10_NOFSI3]]
+; GFX10-SAME:  #[[ATTR6_GFX10_NOFSI3]]
   call void asm sideeffect "; use $0", "a"(i32 poison)
   ret void
 }
@@ -928,21 +570,18 @@ define amdgpu_kernel void @with_inline_asm() {
 
 ; GFX9:  attributes #[[ATTR1_GFX9_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR2_GFX9_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9:  attributes #[[ATTR3_GFX9_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-
-; GFX9:  attributes #[[ATTR4_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR2_GFX9_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR5_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR3_GFX9_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR6_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; GFX9:  attributes #[[ATTR7_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR4_GFX9_IND_CALL]] = { "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR5_GFX9_IND_CALL2]] = { "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR8_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR6_GFX9_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR10_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR8_GFX9_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
-; GFX9:  attributes #[[ATTR11_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; GFX9:  attributes #[[ATTR9_GFX9_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
 
 
 
@@ -953,18 +592,15 @@ define amdgpu_kernel void @with_inline_asm() {
 
 ; GFX10:  attributes #[[ATTR1_GFX10_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR2_GFX10_CC_GRAPHICS]] = { "amdgpu-no-agpr" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR3_GFX10_CC_GRAPHICS2]] = { "amdgpu-no-agpr" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-
-; GFX10:  attributes #[[ATTR4_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR2_GFX10_NO_NOFSI]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR5_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR3_GFX10_NO_NOFSI2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR6_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
-; GFX10:  attributes #[[ATTR7_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR4_GFX10_IND_CALL]] = { "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR5_GFX10_IND_CALL2]] = { "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR8_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR6_GFX10_NOFSI3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR10_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR8_GFX10_NOFSI4]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,20" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
 
-; GFX10:  attributes #[[ATTR11_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }
+; GFX10:  attributes #[[ATTR9_GFX10_NOFSI5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx1010" "uniform-work-group-size"="false" }



More information about the cfe-commits mailing list